Problem Statement

A leading credit card provider CredX receives thousands of applicants every year and, experiencing increased credit loss over the last few years. The CEO wants to mitigate the credit risk.

Analytical Problem Solving Approach using CRISP-DM framework

 1. Business Objective(s)    

  1.1 Business Understanding
  CredX intends to mitigate their credit risk during acquisition by 'Finding The Right Customers'.
        
  1.2 Goals of Data Analysis
    1.2.1. Using past data of the bank's applicants identify the most important factors affecting credit risk
    1.2.2. Create strategies to mitigate the acquisition risk for new applications, by identifying right 
           customers using predictive modelling to differentiate Good Vs Bad customer

 2. Data Understanding
    3.1 Demographic Data
        Contains customer-level information like Age, Gender, Marital Status and Salary etc.
    3.2 Credit Bureau Data
        This is taken from the credit bureau, contains past Avg Credit Card Utilization, Outstanding balance 
        and 30/60/90 DPDs in last 6/12 months etc.

 3. Data Preparation
    3.1 Data Cleaning
    3.2 Data Imputation
    3.3 Feature Engineering
        3.3.1 Derived Variables
        3.3.1 Encoding / Dummy variables
        3.3.1 Weight-of-Evidence (WoE)/Information Value (IV) computation

 4. Exploratory Data Analysis
    4.1 Univariate Analysis
    4.2 Bi-variate 
    4.3 Multi-variate Analysis

 5. Feature selection
    We will use Use Weight-of-Evidence(WoE) /Information Value(IV) for feature selection

 4. Modeling Building       
       This is a binary classification problem with highly unbalanced data. We will apply following combination
    of model types, sampling techniques & cross-validation

    4.1 Demographic Data                 - Unbalanced Data  - Logistic Regression 
    4.2 Demographic & Credit Bureau Data - Unbalanced Data  - Logistic Regression
    4.3 Demographic & Credit Bureau Data - Under Sampling   - Logistic Regression - with Cross Validation
    4.4 Demographic & Credit Bureau Data - Over  Sampling   - Logistic Regression - with Cross Validation 
    4.5 Demographic & Credit Bureau Data - SMOTE Sampling   - Logistic Regression - with Cross Validation
    4.6 Demographic & Credit Bureau Data - SMOTE Sampling   - Decision Trees      - with Cross Validation
    4.7 Demographic & Credit Bureau Data - SMOTE Sampling   - Random Forest       - with Cross Validation

 5. Models Evaluation using Metrics & Final Model Selection                                                                                                                                                                   
    5.1 Accuracy, Sensitivity & Specificity
    5.2 F-Score (F1)
    5.3 Area Under Curve (AUC)
    5.4 KSStatistic
    5.5 ROC Curve
    5.6 Vintage Curve

 6. Model Deployment                                                                                                                                                                            
 7. Application scorecard Building                                                                                                                                                              
 8. Calculate scores on Rejected Population Data                                                                                                                                                                   
 9. Financial Benefit Analysis                                                                                                                         

Loading Libraries and Common Functions

# Loading Libraries
library(ggplot2)
library(dplyr)
library(outliers)
library(corrplot)
library(MASS)
library(caret)
library(ROSE)
library(car)
library(reshape2)
library(scales)
library(tidyr)
library(ROCR)
library(tibble)

# Common Functions
# For calculating Mode value of Categorical variables
ModeFunc <- function(x) {
  ux <- unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}

# Function to determine the outliers in a measure
checkForOutliersDetection <- function(dt, var) {
  var_name <- eval(substitute(var),eval(dt))
  na1 <- sum(is.na(var_name))
  m1 <- mean(var_name, na.rm = T)
  outlier <- boxplot.stats(var_name)$out
  mo <- mean(outlier)
  var_name <- ifelse(var_name %in% outlier, NA, var_name)
  na2 <- sum(is.na(var_name))
  cat("Outliers identified:", na2 - na1, "n")
}

# For plotting correlation matrix
plot_correlationMatrix <- function (data, features) {
  
  melted_cor_matrix <- melt(round(cor(data [ names(data) 
                                             %in% 
                                               features],
                                      use="complete.obs"),2))
  
  ggplot(data = melted_cor_matrix, aes(x=Var1, y=Var2, fill=value, label=value)) +  
    geom_tile()    + 
    geom_text()  + 
    xlab('')   + 
    ylab('') + 
    theme_minimal() + 
    theme(axis.text.x = element_text(size=10, 
                                     hjust=-0.08, 
                                     angle= -35 ))
}

Loading Data

# Loading Demographic Data
demographic_data.original <-read.csv("Demographic data.csv",
                            header = TRUE,
                            stringsAsFactors = FALSE)

# Loading Credit Bureau Data
creditbureau_data.original <- read.csv("Credit Bureau data.csv",
                        header = TRUE,
                        stringsAsFactors = FALSE)

#  Data Cleaning 
#    1. Remove Duplicate Records
#    2. Separate Rejected Applications from Data Analysis & Model Building
#    3. Remove Invalid / Incorrect Values Records
#    4. Missing Values Treatment 
#    5. Outlier Treatment

#  checking for Total row count
# 71295
nrow(demographic_data.original)
## [1] 71295
# 71295
nrow(creditbureau_data.original)
## [1] 71295

Remove Duplicate Records

#  Checking for Duplicate records 
# 71292 - 3 Duplicate records exist
length(unique(demographic_data.original$Application.ID))
## [1] 71292
# 71292 - 3 Duplicate records exist
length(unique(creditbureau_data.original$Application.ID))
## [1] 71292
#  Remove duplicate records
demographic_data <- demographic_data.original[!duplicated(demographic_data.original$Application.ID),]
creditbureau_data <- creditbureau_data.original[!duplicated(creditbureau_data.original$Application.ID),]

summary(demographic_data)
##  Application.ID           Age           Gender         
##  Min.   :1.004e+05   Min.   :-3.00   Length:71292      
##  1st Qu.:2.484e+08   1st Qu.:37.00   Class :character  
##  Median :4.976e+08   Median :45.00   Mode  :character  
##  Mean   :4.990e+08   Mean   :44.94                     
##  3rd Qu.:7.496e+08   3rd Qu.:53.00                     
##  Max.   :1.000e+09   Max.   :65.00                     
##                                                        
##  Marital.Status..at.the.time.of.application. No.of.dependents
##  Length:71292                                Min.   :1.000   
##  Class :character                            1st Qu.:2.000   
##  Mode  :character                            Median :3.000   
##                                              Mean   :2.865   
##                                              3rd Qu.:4.000   
##                                              Max.   :5.000   
##                                              NA's   :3       
##      Income      Education          Profession        Type.of.residence 
##  Min.   :-0.5   Length:71292       Length:71292       Length:71292      
##  1st Qu.:14.0   Class :character   Class :character   Class :character  
##  Median :27.0   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :27.2                                                           
##  3rd Qu.:40.0                                                           
##  Max.   :60.0                                                           
##                                                                         
##  No.of.months.in.current.residence No.of.months.in.current.company
##  Min.   :  6.00                    Min.   :  3.00                 
##  1st Qu.:  6.00                    1st Qu.: 16.00                 
##  Median : 11.00                    Median : 34.00                 
##  Mean   : 34.56                    Mean   : 33.96                 
##  3rd Qu.: 60.00                    3rd Qu.: 51.00                 
##  Max.   :126.00                    Max.   :133.00                 
##                                                                   
##  Performance.Tag 
##  Min.   :0.0000  
##  1st Qu.:0.0000  
##  Median :0.0000  
##  Mean   :0.0422  
##  3rd Qu.:0.0000  
##  Max.   :1.0000  
##  NA's   :1425
summary(creditbureau_data)
##  Application.ID      No.of.times.90.DPD.or.worse.in.last.6.months
##  Min.   :1.004e+05   Min.   :0.0000                              
##  1st Qu.:2.484e+08   1st Qu.:0.0000                              
##  Median :4.976e+08   Median :0.0000                              
##  Mean   :4.990e+08   Mean   :0.2703                              
##  3rd Qu.:7.496e+08   3rd Qu.:0.0000                              
##  Max.   :1.000e+09   Max.   :3.0000                              
##                                                                  
##  No.of.times.60.DPD.or.worse.in.last.6.months
##  Min.   :0.0000                              
##  1st Qu.:0.0000                              
##  Median :0.0000                              
##  Mean   :0.4305                              
##  3rd Qu.:1.0000                              
##  Max.   :5.0000                              
##                                              
##  No.of.times.30.DPD.or.worse.in.last.6.months
##  Min.   :0.0000                              
##  1st Qu.:0.0000                              
##  Median :0.0000                              
##  Mean   :0.5772                              
##  3rd Qu.:1.0000                              
##  Max.   :7.0000                              
##                                              
##  No.of.times.90.DPD.or.worse.in.last.12.months
##  Min.   :0.0000                               
##  1st Qu.:0.0000                               
##  Median :0.0000                               
##  Mean   :0.4503                               
##  3rd Qu.:1.0000                               
##  Max.   :5.0000                               
##                                               
##  No.of.times.60.DPD.or.worse.in.last.12.months
##  Min.   :0.0000                               
##  1st Qu.:0.0000                               
##  Median :0.0000                               
##  Mean   :0.6555                               
##  3rd Qu.:1.0000                               
##  Max.   :7.0000                               
##                                               
##  No.of.times.30.DPD.or.worse.in.last.12.months
##  Min.   :0.0000                               
##  1st Qu.:0.0000                               
##  Median :0.0000                               
##  Mean   :0.8009                               
##  3rd Qu.:1.0000                               
##  Max.   :9.0000                               
##                                               
##  Avgas.CC.Utilization.in.last.12.months
##  Min.   :  0.0                         
##  1st Qu.:  8.0                         
##  Median : 15.0                         
##  Mean   : 29.7                         
##  3rd Qu.: 46.0                         
##  Max.   :113.0                         
##  NA's   :1058                          
##  No.of.trades.opened.in.last.6.months
##  Min.   : 0.000                      
##  1st Qu.: 1.000                      
##  Median : 2.000                      
##  Mean   : 2.298                      
##  3rd Qu.: 3.000                      
##  Max.   :12.000                      
##  NA's   :1                           
##  No.of.trades.opened.in.last.12.months
##  Min.   : 0.000                       
##  1st Qu.: 2.000                       
##  Median : 5.000                       
##  Mean   : 5.827                       
##  3rd Qu.: 9.000                       
##  Max.   :28.000                       
##                                       
##  No.of.PL.trades.opened.in.last.6.months
##  Min.   :0.000                          
##  1st Qu.:0.000                          
##  Median :1.000                          
##  Mean   :1.207                          
##  3rd Qu.:2.000                          
##  Max.   :6.000                          
##                                         
##  No.of.PL.trades.opened.in.last.12.months
##  Min.   : 0.000                          
##  1st Qu.: 0.000                          
##  Median : 2.000                          
##  Mean   : 2.397                          
##  3rd Qu.: 4.000                          
##  Max.   :12.000                          
##                                          
##  No.of.Inquiries.in.last.6.months..excluding.home...auto.loans.
##  Min.   : 0.000                                                
##  1st Qu.: 0.000                                                
##  Median : 1.000                                                
##  Mean   : 1.764                                                
##  3rd Qu.: 3.000                                                
##  Max.   :10.000                                                
##                                                                
##  No.of.Inquiries.in.last.12.months..excluding.home...auto.loans.
##  Min.   : 0.000                                                 
##  1st Qu.: 0.000                                                 
##  Median : 3.000                                                 
##  Mean   : 3.535                                                 
##  3rd Qu.: 5.000                                                 
##  Max.   :20.000                                                 
##                                                                 
##  Presence.of.open.home.loan Outstanding.Balance Total.No.of.Trades
##  Min.   :0.0000             Min.   :      0     Min.   : 0.000    
##  1st Qu.:0.0000             1st Qu.: 211537     1st Qu.: 3.000    
##  Median :0.0000             Median : 774994     Median : 6.000    
##  Mean   :0.2564             Mean   :1249195     Mean   : 8.187    
##  3rd Qu.:1.0000             3rd Qu.:2920797     3rd Qu.:10.000    
##  Max.   :1.0000             Max.   :5218801     Max.   :44.000    
##  NA's   :272                NA's   :272                           
##  Presence.of.open.auto.loan Performance.Tag 
##  Min.   :0.00000            Min.   :0.0000  
##  1st Qu.:0.00000            1st Qu.:0.0000  
##  Median :0.00000            Median :0.0000  
##  Mean   :0.08462            Mean   :0.0422  
##  3rd Qu.:0.00000            3rd Qu.:0.0000  
##  Max.   :1.00000            Max.   :1.0000  
##                             NA's   :1425
# Checking NA Values
sapply(demographic_data, function(x) sum(is.na(x) | is.null(x)))
##                              Application.ID 
##                                           0 
##                                         Age 
##                                           0 
##                                      Gender 
##                                           0 
## Marital.Status..at.the.time.of.application. 
##                                           0 
##                            No.of.dependents 
##                                           3 
##                                      Income 
##                                           0 
##                                   Education 
##                                           0 
##                                  Profession 
##                                           0 
##                           Type.of.residence 
##                                           0 
##           No.of.months.in.current.residence 
##                                           0 
##             No.of.months.in.current.company 
##                                           0 
##                             Performance.Tag 
##                                        1425
sapply(creditbureau_data, function(x) sum(is.na(x) | is.null(x)))
##                                                  Application.ID 
##                                                               0 
##                    No.of.times.90.DPD.or.worse.in.last.6.months 
##                                                               0 
##                    No.of.times.60.DPD.or.worse.in.last.6.months 
##                                                               0 
##                    No.of.times.30.DPD.or.worse.in.last.6.months 
##                                                               0 
##                   No.of.times.90.DPD.or.worse.in.last.12.months 
##                                                               0 
##                   No.of.times.60.DPD.or.worse.in.last.12.months 
##                                                               0 
##                   No.of.times.30.DPD.or.worse.in.last.12.months 
##                                                               0 
##                          Avgas.CC.Utilization.in.last.12.months 
##                                                            1058 
##                            No.of.trades.opened.in.last.6.months 
##                                                               1 
##                           No.of.trades.opened.in.last.12.months 
##                                                               0 
##                         No.of.PL.trades.opened.in.last.6.months 
##                                                               0 
##                        No.of.PL.trades.opened.in.last.12.months 
##                                                               0 
##  No.of.Inquiries.in.last.6.months..excluding.home...auto.loans. 
##                                                               0 
## No.of.Inquiries.in.last.12.months..excluding.home...auto.loans. 
##                                                               0 
##                                      Presence.of.open.home.loan 
##                                                             272 
##                                             Outstanding.Balance 
##                                                             272 
##                                              Total.No.of.Trades 
##                                                               0 
##                                      Presence.of.open.auto.loan 
##                                                               0 
##                                                 Performance.Tag 
##                                                            1425
# Making the Performance Tag as a factor variable as new feature
demographic_data$Performance <- as.factor(demographic_data$Performance.Tag)

# Validate Performance Tag across Application IDs in both data-sets
# [1] Application.ID  Performance.Tag
# <0 rows> (or 0-length row.names)
setdiff(dplyr::select(demographic_data, Application.ID, Performance.Tag),dplyr::select(creditbureau_data, Application.ID, Performance.Tag))
## [1] Application.ID  Performance.Tag
## <0 rows> (or 0-length row.names)
customer_master_data <- merge(demographic_data, creditbureau_data, by="Application.ID")
View(customer_master_data)

# Remove Performance.Tag.x and Performance.Tag.y
customer_master_data <- customer_master_data[,-12]
customer_master_data <- customer_master_data[,-30]

Remove Rejected Applications Records

# Check distribution of Classes and also NA values
#    0     1  NA's 
#66920  2947  1425 
summary(customer_master_data$Performance)
##     0     1  NA's 
## 66920  2947  1425
# Also classes (1 and 0) distribution, is highly unbalanced
# [1] 1.9992
((1425/71276)*100)
## [1] 1.99927
# The records with Performance = NA are treated as Applications rejected. 
# There are approximately 2% and we are ignoring them in the modeling.
# Separating 1425 records Performance = NA
rejected_records <- customer_master_data[which(is.na(customer_master_data$Performance)),]

# [1] 1425
nrow(rejected_records)
## [1] 1425
# Retaining records that have Performance = 1 or 0
customer_master_data <- customer_master_data[-which(is.na(customer_master_data$Performance)),]

# Data Quality Checks for Rejected Population
# [1] 69867
length(customer_master_data$Performance)
## [1] 69867
# [1] 1425
length(rejected_records$Application.ID)
## [1] 1425
# [1] 1425
# No Duplicate records in Rejected population
length(unique(rejected_records$Application.ID))
## [1] 1425
# No Records with Age <18
sort(unique(rejected_records$Age), decreasing = FALSE)
##  [1] 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44
## [24] 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65
# [1] 0
sum(is.na(rejected_records$No.of.months.in.current.company))
## [1] 0
# [1] 0
sum(is.na(rejected_records$No.of.months.in.current.residence))
## [1] 0
# [1] 0
sum(is.na(rejected_records$No.of.times.30.DPD.or.worse.in.last.6.months))
## [1] 0
# [1] 0
sum(is.na(rejected_records$No.of.trades.opened.in.last.12.months))
## [1] 0
# [1] 1460
sum(is.na(rejected_records))
## [1] 1460
# [1] 1425
sum(is.na(rejected_records$Performance))
## [1] 1425

Removing Records with Invalid Data

# Demographic Data contains 65 records with Age < 18 which is an invalid value
# 65
length(which(customer_master_data$Age < 18))
## [1] 65
# 20
length(which(customer_master_data$Age <= 0))
## [1] 20
# 65 records exists with Age < 18, with only 1 as defaulter
dplyr::select(customer_master_data[which(customer_master_data$Age < 18),], Age, Application.ID, Performance)
##       Age Application.ID Performance
## 913    17       13167456           0
## 1111   17       15988053           1
## 6462   17       89770640           0
## 9315   16      130064793           0
## 9523   16      133256231           0
## 9634   16      134951209           0
## 10705  15      149528904           0
## 18401  15      256752828           0
## 22053  15      307736934           0
## 23024  17      321167182           0
## 23098  15      322139302           0
## 23371   0      325992471           0
## 23424  16      326621134           0
## 23512  17      327723491           0
## 23760   0      331033631           0
## 24313  17      339076142           0
## 26576  15      371030845           0
## 27205   0      380153306           0
## 27270  17      381342630           0
## 28787  17      403119279           0
## 29283  15      410153337           0
## 30880  17      431350709           0
## 31572  15      440605214           0
## 31606   0      441104387           0
## 33127   0      463622314           0
## 34569  17      483332934           0
## 35131  15      490791674           0
## 35289   0      492869740           0
## 36177  17      505345414           0
## 38231  16      534251279           0
## 38304  16      535285757           0
## 38974  16      544978628           0
## 39594  17      553384326           0
## 40638  17      568773679           0
## 41760  16      584758872           0
## 41815   0      585528536           0
## 42014  16      588417745           0
## 42155   0      590303560           0
## 43015  15      601969470           0
## 44439  17      622880517           0
## 45269  -3      634180637           0
## 47543   0      666398799           0
## 50066  15      701653071           0
## 50375  17      705529178           0
## 51200  16      717434244           0
## 53190  17      745462418           0
## 55160  17      773171568           0
## 55897   0      783195548           0
## 55957  17      783913343           0
## 56410   0      790296430           0
## 57503  17      805308826           0
## 57793   0      809393409           0
## 58025  17      812365997           0
## 60158  17      842745102           0
## 60649   0      848965588           0
## 60949   0      852875477           0
## 61664   0      862922520           0
## 64920  15      910506404           0
## 65339   0      915848904           0
## 65636  16      920083644           0
## 66567   0      932483616           0
## 68786  15      963813463           0
## 68833   0      964427213           0
## 69604  17      975085624           0
## 69820   0      977872820           0
# Removing records with Age < 18
customer_master_data <- customer_master_data[-which(customer_master_data$Age < 18),]
# [1] 69802
nrow(customer_master_data)
## [1] 69802

Missing Values - Data Imputation

# Replacing missing values which are small in number, using simple and straight-forward techniques e.g. Mode etc
#
# No.of.trades.opened.in.last.6.months
#   - Only 1 missing value exist
#   - Make NA value as '0'
customer_master_data[which(is.na(customer_master_data$No.of.trades.opened.in.last.6.months)), 
                     "No.of.trades.opened.in.last.6.months"] <- 0
# Verify values
length(which(is.na(customer_master_data$No.of.trades.opened.in.last.6.months)))
## [1] 0
# No.of.Dependents
#   - Only 3 missing values exist
#   - Make NA values for No.of.Dependents as '0'
customer_master_data [which(is.na(customer_master_data$No.of.dependents)), 
                      "No.of.dependents"] <- 0

# Verify values 
# [1] 0
length(which(is.na(customer_master_data$No.of.dependents)))
## [1] 0
# Gender 
#   - Only 2 missing values
#   - Replacing with Mode value
summary(factor(customer_master_data$Gender))
##           F     M 
##     1 16490 53311
customer_master_data[-which(customer_master_data$Gender %in% c("F","M")), 
                     "Gender"] <- ModeFunc(customer_master_data$Gender)


# Marital Status
# Check for invalid i.e. NA values for Marital Status
nrow(customer_master_data[ -which(customer_master_data$Marital.Status..at.the.time.of.application. 
                                  %in% 
                                  c("Married","Single")),])
## [1] 5
# Imputing with Mode i.e. "Married"
customer_master_data[-which( customer_master_data$Marital.Status..at.the.time.of.application. 
                             %in% 
                             c("Married","Single")), 
                     "Marital.Status..at.the.time.of.application."] <- ModeFunc(customer_master_data$Marital.Status..at.the.time.of.application.)

# Profession
#     - Only 12 values are missing
#     - Impute with Mode ()
summary(factor(customer_master_data$Profession))
##             SAL      SE SE_PROF 
##      12   39639   13915   16236
# Imputing with Mode Value i.e. "SAL"
customer_master_data[-which(customer_master_data$Profession 
                            %in% 
                              c('SAL','SE','SE_PROF')), 
                     "Profession"] <- ModeFunc(customer_master_data$Profession)

# Type of residence
#     - 8 values are missing
#     - Impute with Mode ()
summary(factor(customer_master_data$Type.of.residence))
##                        Company provided Living with Parents 
##                   8                1601                1767 
##              Others               Owned              Rented 
##                 198               13986               52242
# Imputing with Mode value i.e. "Rented"
customer_master_data[-which(customer_master_data$Type.of.residence 
                            %in% 
                              c('Company provided',
                                'Living with Parents',
                                'Others',
                                'Owned',
                                'Rented')), 
                     "Type.of.residence"] <- ModeFunc(customer_master_data$Type.of.residence)


# Number of months in current residence
length(which(customer_master_data$No.of.months.in.current.residence <= 0))
## [1] 0
summary(customer_master_data$No.of.months.in.current.residence)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    6.00    6.00   10.00   34.57   61.00  126.00
# Number of months in current company
# imputing age less than or equal to 0
length(which(customer_master_data$No.of.months.in.current.company <= 0))
## [1] 0
summary(customer_master_data$No.of.months.in.current.company)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    3.00   17.00   34.00   34.19   51.00  133.00
# Education
# 118 records are with NA values - Need Imputation with WoE
nrow(customer_master_data[-which(customer_master_data$Education %in% c('Bachelor','Masters','Others','Phd','Professional')),])
## [1] 118
# Income 
# 106 records are with NA values - Need Imputation with WoE
length(which(customer_master_data$Income <= 0))
## [1] 106
customer_master_data$Income_imputed <- customer_master_data$Income
customer_master_data[which(customer_master_data$Income_imputed <=0), "Income_imputed"] <- NA

rejected_records$Income_imputed <- rejected_records$Income
rejected_records[which(rejected_records$Income_imputed <=0), "Income_imputed"] <- NA

View(customer_master_data)

Outlier Treatment

Note - Outliers removal is not required to perform on all measures, as it is not impacting any results Following are the variables with outliers.

Outstanding.Balance Income Avgas.CC.Utilization.in.last.12.months Total.No.of.Trades No.of.trades.opened.in.last.12.months No.of.Inquiries.in.last.12.months..excluding.home…auto.loans. No.of.PL.trades.opened.in.last.12.months

checkForOutliersDetection(customer_master_data, Outstanding.Balance) #function call
## Outliers identified: 0 n
# Outliers identified: 0
summary(customer_master_data$Outstanding.Balance)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##       0  208494  774191 1253107 2925974 5218801     272
hist(customer_master_data$Outstanding.Balance, main = "Histogram of Outstanding.Balance")

# Income
# Outliers identified: 0
checkForOutliersDetection(customer_master_data, Income) 
## Outliers identified: 0 n
hist(customer_master_data$Income, main = "Histogram of Income")

# Avgas.CC.Utilization.in.last.12.months
# Outliers identified: 3624
checkForOutliersDetection(customer_master_data, Avgas.CC.Utilization.in.last.12.months)
## Outliers identified: 3624 n
hist(customer_master_data$Avgas.CC.Utilization.in.last.12.months, 
     main = "Histogram of Avgas.CC.Utilization.in.last.12.months")

outlier_range<-1.5*IQR(customer_master_data$Avgas.CC.Utilization.in.last.12.months, 
                       na.rm = T) #1843 outlier
upper_whisker=unname(quantile(customer_master_data$Avgas.CC.Utilization.in.last.12.months,
                              0.95, 
                              na.rm = T))+outlier_range
lower_whisker=unname(quantile(customer_master_data$Avgas.CC.Utilization.in.last.12.months,
                              0.05, 
                              na.rm = T))-outlier_range
customer_master_data2 <- customer_master_data[which(
  (customer_master_data$Avgas.CC.Utilization.in.last.12.months>=upper_whisker | 
  customer_master_data$Avgas.CC.Utilization.in.last.12.months<=lower_whisker) ==FALSE),]

summary(customer_master_data$Performance)
##     0     1 
## 66856  2946
summary(customer_master_data2$Performance)
##     0     1 
## 65886  2898
# Total.No.of.Trades
# Outliers identified: 6818
checkForOutliersDetection(customer_master_data2, Total.No.of.Trades)
## Outliers identified: 6818 n
hist(customer_master_data2$Total.No.of.Trades, main = "Histogram of Total.No.of.Trades")

outlier_range<-1.5*IQR(customer_master_data2$Total.No.of.Trades, na.rm = T) #1843 outlier
upper_whisker=unname(quantile(customer_master_data2$Total.No.of.Trades,
                              0.95, 
                              na.rm = T))+outlier_range
lower_whisker=unname(quantile(customer_master_data2$Total.No.of.Trades,
                              0.05, 
                              na.rm = T))-outlier_range
customer_master_data3 <- customer_master_data2[which(
  (customer_master_data2$Total.No.of.Trades>=upper_whisker | 
  customer_master_data2$Total.No.of.Trades<=lower_whisker)==FALSE),]

summary(customer_master_data3$Performance)
##     0     1 
## 65779  2896
# Discarding the Outlier treatment as models are performing better with having Outliers
# It is not always required to remove outliers, as they carry on important patters & trends
# 
# customer_master_data <- customer_master_data3

Feature Engineering - Derived Variables

customer_master_data$Income_bin <- as.factor(cut(customer_master_data$Income,
                                                    breaks = c(-Inf,1,11,21,31,40,Inf),
                                                    labels=c("<0","1-10","11-20","21-30","31-40",">40"), 
                                                           ordered = TRUE))

customer_master_data$age_bin <- as.factor(cut(customer_master_data$Age,
                                    breaks=c(-Inf,35,46,55,Inf),
                                    labels=c("<35","35-45","46-55",">55"), ordered = TRUE,
                                    right = FALSE))

customer_master_data$current_residence_bin <- as.factor(cut(customer_master_data$No.of.months.in.current.residence,
                                                            breaks = c(-Inf,13,25,
                                                                       37,49,61,
                                                                       73,85,97,
                                                                       109,121,Inf),
                                                            labels=c("<12","13-24",
                                                                     "25-36","37-48",
                                                                      "49-60","61-72",
                                                                      "73-84","85-96",
                                                                      "97-108","109-120",
                                                                     ">120"),
                                                            ordered = TRUE))

customer_master_data$current_company_bin <- as.factor(cut(customer_master_data$No.of.months.in.current.company, 
                                                        breaks = c(1,12, 24, 36, 48, 
                                                                   60, 72,84,96, 
                                                                   108,120,136),
                                                            ordered = TRUE))

Feature Engineering - Weight Of Evidence(WoE)/ Information Value(IV) Analysis

The data contains status of customer performance through variable Performance Tag with value 1 representing Default and 0 for Non-Default. We leverage R Information package for computing the Information Values (IV). But this package interprets 1 value for Good which is contradictory to business case here as Performance Tag value. So, we need another variable Performance Tag for IV with values 1 and 0 replaced with 0 and 1 respectively.

Reference table for Variable Importance Analysis based on Information Value(IV)

     Information Value(IV)   Predictive Power
       
     
                     <0.02 -> Useless for Prediction
                0.02 - 0.1 -> Weak Predictor
                0.1  - 0.3 -> Medium Predictor
                0.3  - 0.5 -> Strong Predictor
                      >0.5 -> Suspecious
# Using WoE for both Variable Importance and also Missing Values

library(Information)
str(customer_master_data$Performance)
##  Factor w/ 2 levels "0","1": 1 2 1 1 1 1 1 1 1 1 ...
# Performance Tag for IV with values 1 and 0 replaced with 0 and 1 respectively
customer_master_data$Performance.Tag_forIV <- ifelse(customer_master_data$Performance == 0, 1, 0)

IV <- create_infotables(data=customer_master_data, 
                        y="Performance.Tag_forIV",
                        bins = 10, 
                        parallel = FALSE)

IV_Value = data.frame(IV$Summary)

# Printing values >=0.02
arrange(IV_Value [IV_Value$IV >=0.02, ], desc(IV))
##                                                           Variable
## 1                           Avgas.CC.Utilization.in.last.12.months
## 2                            No.of.trades.opened.in.last.12.months
## 3                         No.of.PL.trades.opened.in.last.12.months
## 4  No.of.Inquiries.in.last.12.months..excluding.home...auto.loans.
## 5                                              Outstanding.Balance
## 6                     No.of.times.30.DPD.or.worse.in.last.6.months
## 7                                               Total.No.of.Trades
## 8                          No.of.PL.trades.opened.in.last.6.months
## 9                    No.of.times.90.DPD.or.worse.in.last.12.months
## 10                    No.of.times.60.DPD.or.worse.in.last.6.months
## 11  No.of.Inquiries.in.last.6.months..excluding.home...auto.loans.
## 12                   No.of.times.30.DPD.or.worse.in.last.12.months
## 13                            No.of.trades.opened.in.last.6.months
## 14                   No.of.times.60.DPD.or.worse.in.last.12.months
## 15                    No.of.times.90.DPD.or.worse.in.last.6.months
## 16                               No.of.months.in.current.residence
## 17                                           current_residence_bin
## 18                                                  Income_imputed
## 19                                                          Income
## 20                                                      Income_bin
## 21                                 No.of.months.in.current.company
##            IV
## 1  0.31015860
## 2  0.29827712
## 3  0.29604052
## 4  0.29560438
## 5  0.24604217
## 6  0.24167952
## 7  0.23713984
## 8  0.21973098
## 9  0.21393775
## 10 0.20592613
## 11 0.20523987
## 12 0.19848248
## 13 0.18597050
## 14 0.18563797
## 15 0.16016368
## 16 0.07913990
## 17 0.06080075
## 18 0.04393392
## 19 0.04255551
## 20 0.04007508
## 21 0.02175181

Data Imputation based on WoE Analysis Approach

# Two New columns are being added for every variable whose WoE analysis being done
# 1. Add WoE values for the variable as new feature (e.g. <variable>_WoE) and retain original feature/values in master data frame
# 2. For imputation to NA/Missing/Incorrect values BIN find other BIN/Bucket with nearest WoE value close enough, 
#    i.e. add new variable as <variable>_imputed
#    2a) For a continuous variable - when nearest match found, use median value of matching bucket
#    2b) For a continuous variable - when nearest match NOT found, use median for whole continuous variable
#    2c) For a categorical variable - when nearest match found, use Mode value of matching bucket
#    2d) For a categorical variable - when nearest match NOT found, use Mode value of whole categorical variable


# 
# WoE Analysis for Education
Education_bin <- data.frame(IV$Tables$Education)
print(IV$Tables$Education)
##      Education     N     Percent          WOE           IV
## 1                118 0.001690496 -0.004142828 2.906912e-08
## 2     Bachelor 17282 0.247586029 -0.016495934 6.791224e-05
## 3      Masters 23465 0.336165153 -0.008043166 8.973987e-05
## 4       Others   119 0.001704822 -0.492004075 6.089329e-04
## 5          Phd  4455 0.063823386  0.028258493 6.592444e-04
## 6 Professional 24363 0.349030114  0.017649959 7.671004e-04
plot_infotables(IV,"Education")

# Creating a column with WoE for Education
customer_master_data$Education_WoE <- ifelse(customer_master_data$Education=="Bachelor",
                                             IV$Tables$Education$WOE[2],
                                             ifelse(customer_master_data$Education=="Masters",
                                                    IV$Tables$Education$WOE[3],
                                                    ifelse(customer_master_data$Education=="Others",
                                                           IV$Tables$Education$WOE[4],
                                                           ifelse(customer_master_data$Education=="Phd",
                                                                  IV$Tables$Education$WOE[5],
                                                                  ifelse(customer_master_data$Education=="Professional",
                                                                         IV$Tables$Education$WOE[6],
                                                                         IV$Tables$Education$WOE[1])))))
unique(customer_master_data$Education_WoE)
## [1]  0.017649959  0.028258493 -0.008043166 -0.016495934 -0.004142828
## [6] -0.492004075
# Replace 'NA' (WoE = -0.004112913) values with  Masters (nearest WoE = -0.008057761)
customer_master_data$Education_imputed <- ifelse(customer_master_data$Education == "",
                                                 "Masters",
                                                 customer_master_data$Education)


# WoE Analysis for Income
Income.bin <- data.frame(IV$Tables$Income_imputed)

print(Income.bin)
##    Income_imputed    N     Percent         WOE          IV
## 1              NA  106 0.001518581  0.82915098 0.000726913
## 2           [1,5] 6222 0.089137847 -0.31412051 0.010901968
## 3          [6,10] 6508 0.093235151 -0.27545857 0.018939175
## 4         [11,16] 7916 0.113406493 -0.06639700 0.019454612
## 5         [17,21] 6795 0.097346781 -0.08141794 0.020124511
## 6         [22,26] 6821 0.097719263 -0.02551861 0.020188895
## 7         [27,31] 6807 0.097518696 -0.07956922 0.020829301
## 8         [32,36] 6820 0.097704937  0.15505200 0.023018462
## 9         [37,41] 6711 0.096143377  0.26716146 0.029100697
## 10        [42,48] 7780 0.111458124  0.17694798 0.032321235
## 11        [49,60] 7316 0.104810750  0.36098054 0.043933921
# # Replace '[-0.5,5]' (WoE = 0.30218631) values with median of [6,10] (WoE = -0.27545857)
# customer_master_data$Income_imputed <- ifelse(is.na(customer_master_data$Income_imputed),
#                                                            median(customer_master_data$Income_imputed, na.rm = TRUE),
#                                                            customer_master_data$Income_imputed)

# creating a Woe column for Income for master dataframe
customer_master_data$Income_imputed_WoE <- ifelse(is.na(customer_master_data$Income_imputed),
   IV$Tables$Income_imputed$WOE[1],
   ifelse(between(customer_master_data$Income_imputed,1,5),
         IV$Tables$Income_imputed$WOE[2],
         ifelse(between(customer_master_data$Income_imputed,6,10), 
                IV$Tables$Income_imputed$WOE[3],
                ifelse(between(customer_master_data$Income_imputed,11,16),
                       IV$Tables$Income_imputed$WOE[4],
                       ifelse(between(customer_master_data$Income_imputed,17,21),
                              IV$Tables$Income_imputed$WOE[5],
                              ifelse(between(customer_master_data$Income_imputed,22,26),
                                     IV$Tables$Income_imputed$WOE[6],
                                     ifelse(between(customer_master_data$Income_imputed,27,31),
                                            IV$Tables$Income_imputed$WOE[7],
                                            ifelse(between(customer_master_data$Income_imputed,32,36),
            IV$Tables$Income_imputed$WOE[8],
            ifelse(between(customer_master_data$Income_imputed,37,41),
                   IV$Tables$Income_imputed$WOE[9],
                   ifelse(between(customer_master_data$Income_imputed,42,48),
                          IV$Tables$Income_imputed$WOE[10],
                          IV$Tables$Income_imputed$WOE[11]))))))))))

unique(customer_master_data$Income_imputed_WoE)
##  [1]  0.15505200 -0.06639700 -0.07956922  0.36098054  0.17694798
##  [6] -0.31412051  0.26716146 -0.27545857 -0.08141794 -0.02551861
## [11]  0.82915098
# [1]  0.15505200 -0.06639700 -0.07956922  0.36098054  0.17694798 -0.31412051  0.26716146 -0.27545857 -0.08141794 -0.02551861  0.82915098

# creating a Woe column for Income for rejected records
rejected_records$Income_imputed_WoE <- ifelse(is.na(rejected_records$Income_imputed),
           IV$Tables$Income_imputed$WOE[1],
           ifelse(between(rejected_records$Income_imputed,1,5),
                  IV$Tables$Income_imputed$WOE[2],
                  ifelse(between(rejected_records$Income_imputed,6,10), 
                         IV$Tables$Income_imputed$WOE[3],
                         ifelse(between(rejected_records$Income_imputed,11,16),
                                IV$Tables$Income_imputed$WOE[4],
                                ifelse(between(rejected_records$Income_imputed,17,21),
                                       IV$Tables$Income_imputed$WOE[5],
                                       ifelse(between(rejected_records$Income_imputed,22,26),
       IV$Tables$Income_imputed$WOE[6],
       ifelse(between(rejected_records$Income_imputed,27,31),
              IV$Tables$Income_imputed$WOE[7],
              ifelse(between(rejected_records$Income_imputed,32,36),
                     IV$Tables$Income_imputed$WOE[8],
                     ifelse(between(rejected_records$Income_imputed,37,41),
                            IV$Tables$Income_imputed$WOE[9],
                          ifelse(between(rejected_records$Income_imputed,42,48),
                                   IV$Tables$Income_imputed$WOE[10],
                                   IV$Tables$Income_imputed$WOE[11]))))))))))

unique(rejected_records$Income_imputed_WoE)
##  [1] -0.27545857 -0.02551861 -0.07956922 -0.08141794 -0.31412051
##  [6]  0.26716146  0.36098054 -0.06639700  0.15505200  0.17694798
# Replace NA with median for master dataframe
customer_master_data[which(is.na(customer_master_data$Income_imputed)), 
                     "Income_imputed"] <- median(customer_master_data$Income_imputed,
                                                 na.rm = TRUE)
# Replace NA with median for rejected records dataframe
rejected_records$Income_imputed <- rejected_records$Income
rejected_records[which(is.na(rejected_records$Income_imputed)), 
                 "Income_imputed"] <- median(rejected_records$Income_imputed,
                                             na.rm = TRUE)
plot_infotables(IV,"Income_imputed")

# WoE Analysis for Presence.of.open.home.loan
# Creating a WoE column for Presence.of.open.home.loan
Presence.of.open.home.loan.bin <- data.frame(IV$Tables$Presence.of.open.home.loan)
print(IV$Tables$Presence.of.open.home.loan)
##   Presence.of.open.home.loan     N     Percent         WOE           IV
## 1                         NA   272 0.003896736  0.37441483 0.0004617429
## 2                      [0,0] 51487 0.737614968 -0.07384043 0.0046222731
## 3                      [1,1] 18043 0.258488295  0.23737694 0.0177045058
plot_infotables(IV,"Presence.of.open.home.loan")

customer_master_data$Presence.of.open.home.loan_WoE <- ifelse(is.na(customer_master_data$Presence.of.open.home.loan),
                       IV$Tables$Presence.of.open.home.loan$WOE[1],
                       ifelse(customer_master_data$Presence.of.open.home.loan==1,
                              IV$Tables$Presence.of.open.home.loan$WOE[3],
                              IV$Tables$Presence.of.open.home.loan$WOE[2]))
# Replace 'NA' (WoE = 0.37444474) values with 1 (nearest WoE = 0.23740686)
customer_master_data$Presence.of.open.home.loan_imputed <- ifelse(is.na(customer_master_data$Presence.of.open.home.loan),
                           1,
                           customer_master_data$Presence.of.open.home.loan)



# WoE Analysis for Outstanding.Balance
# Creating a WoE column for Outstanding.Balance
Outstanding.Balance.bin <- data.frame(IV$Tables$Outstanding.Balance)
print(IV$Tables$Outstanding.Balance)
##    Outstanding.Balance    N     Percent        WOE           IV
## 1                   NA  272 0.003896736  0.3744148 0.0004617429
## 2             [0,6851] 6952 0.099596000  0.7700212 0.0425995861
## 3         [6852,25590] 6953 0.099610326  0.9199346 0.0991493847
## 4       [25600,386878] 6953 0.099610326  0.1340639 0.1008337943
## 5      [386879,585389] 6953 0.099610326 -0.2547105 0.1081048290
## 6      [585402,774181] 6953 0.099610326 -0.4513214 0.1331444733
## 7      [774188,972265] 6953 0.099610326 -0.4369762 0.1564590444
## 8     [972273,1357072] 6953 0.099610326 -0.4027883 0.1759510634
## 9    [1357076,2960907] 6953 0.099610326  0.3819991 0.1881965294
## 10   [2960909,3282409] 6953 0.099610326  0.8306179 0.2360178035
## 11   [3282457,5218801] 6954 0.099624653 -0.2961643 0.2460421728
plot_infotables(IV,"Outstanding.Balance")

customer_master_data$Outstanding.Balance_WoE <- ifelse(is.na(customer_master_data$Outstanding.Balance),
                IV$Tables$Outstanding.Balance$WOE[1],
                ifelse(customer_master_data$Outstanding.Balance <=6851,
                       IV$Tables$Outstanding.Balance$WOE[2],
                       ifelse(between(customer_master_data$Outstanding.Balance,6852,25590),
                              IV$Tables$Outstanding.Balance$WOE[3],
                              ifelse(between(customer_master_data$Outstanding.Balance,25600,386878),
                                     IV$Tables$Outstanding.Balance$WOE[4],
                                     ifelse(between(customer_master_data$Outstanding.Balance,386879,585389),
                                            IV$Tables$Outstanding.Balance$WOE[5],
                                           ifelse(between(customer_master_data$Outstanding.Balance,585402,774181),
           IV$Tables$Outstanding.Balance$WOE[6],
          ifelse(between(customer_master_data$Outstanding.Balance,774188,972265),
                 IV$Tables$Outstanding.Balance$WOE[7],
                 ifelse(between(customer_master_data$Outstanding.Balance,972273,1357072),
                        IV$Tables$Outstanding.Balance$WOE[8],
                        ifelse(between(customer_master_data$Outstanding.Balance,1357076,2960907),
                               IV$Tables$Outstanding.Balance$WOE[9],
                               ifelse(between(customer_master_data$Outstanding.Balance,2960909,3282409),
                                      IV$Tables$Outstanding.Balance$WOE[10],
                                      IV$Tables$Outstanding.Balance$WOE[11]))))))))))
unique(customer_master_data$Outstanding.Balance_WoE)
##  [1] -0.2961643 -0.4513214 -0.4369762  0.1340639 -0.4027883  0.7700212
##  [7] -0.2547105  0.8306179  0.3819991  0.9199346  0.3744148
# Calculate for rejected population
rejected_records$Outstanding.Balance_WoE <- ifelse(is.na(rejected_records$Outstanding.Balance),
                IV$Tables$Outstanding.Balance$WOE[1],
                ifelse(rejected_records$Outstanding.Balance <=6851,
                       IV$Tables$Outstanding.Balance$WOE[2],
                       ifelse(between(rejected_records$Outstanding.Balance,6852,25590),
                              IV$Tables$Outstanding.Balance$WOE[3],
                              ifelse(between(rejected_records$Outstanding.Balance,25600,386878),
                                     IV$Tables$Outstanding.Balance$WOE[4],
                                     ifelse(between(rejected_records$Outstanding.Balance,386879,585389),
                                            IV$Tables$Outstanding.Balance$WOE[5],
                                            ifelse(between(rejected_records$Outstanding.Balance,585402,774181),
            IV$Tables$Outstanding.Balance$WOE[6],
            ifelse(between(rejected_records$Outstanding.Balance,774188,972265),
                   IV$Tables$Outstanding.Balance$WOE[7],
                   ifelse(between(rejected_records$Outstanding.Balance,972273,1357072),
                          IV$Tables$Outstanding.Balance$WOE[8],
                          ifelse(between(rejected_records$Outstanding.Balance,1357076,2960907),
                                 IV$Tables$Outstanding.Balance$WOE[9],
                                 ifelse(between(rejected_records$Outstanding.Balance,2960909,3282409),
                                        IV$Tables$Outstanding.Balance$WOE[10],
                                        IV$Tables$Outstanding.Balance$WOE[11]))))))))))

unique(rejected_records$Outstanding.Balance_WoE)
## [1] -0.4027883 -0.4369762 -0.4513214  0.1340639 -0.2547105 -0.2961643
## [7]  0.8306179  0.3819991  0.9199346
# Replace 'NA' (WoE = 0.3744447) values with median of [1357118,2960907] (WoE = 0.3818808)
customer_master_data$Outstanding.Balance_imputed <- ifelse(is.na(customer_master_data$Outstanding.Balance),
                    median(filter(customer_master_data, Outstanding.Balance >=1357076 & Outstanding.Balance<=2960907) [, "Outstanding.Balance"]),
                    customer_master_data$Outstanding.Balance)
# No 'NA' values for Outstanding.Balance in Rejected population
sum(is.na(rejected_records$Outstanding.Balance))
## [1] 0
rejected_records$Outstanding.Balance_imputed <- rejected_records$Outstanding.Balance

# WoE Analysis for Avgas.CC.Utilization.in.last.12.months
print(IV$Tables$Avgas.CC.Utilization.in.last.12.months)
##    Avgas.CC.Utilization.in.last.12.months    N    Percent         WOE
## 1                                      NA 1018 0.01458411 -0.11599767
## 2                                   [0,4] 5521 0.07909515  0.80182190
## 3                                   [5,6] 5463 0.07826423  0.80062757
## 4                                   [7,8] 6856 0.09822068  0.79320847
## 5                                  [9,11] 9587 0.13734563  0.67681490
## 6                                 [12,14] 6585 0.09433827  0.46706002
## 7                                 [15,21] 6851 0.09814905  0.07916613
## 8                                 [22,37] 7116 0.10194550 -0.47533086
## 9                                 [38,51] 6742 0.09658749 -0.58461156
## 10                                [52,71] 7016 0.10051288 -0.56326710
## 11                               [72,113] 7047 0.10095699 -0.38102610
##             IV
## 1  0.000206996
## 2  0.036016027
## 3  0.071360890
## 4  0.115034194
## 5  0.161716538
## 6  0.178421559
## 7  0.179014873
## 8  0.207765296
## 9  0.251161250
## 10 0.292660548
## 11 0.310158603
plot_infotables(IV,"Avgas.CC.Utilization.in.last.12.months")

customer_master_data$Avgas.CC.Utilization.in.last.12.months_WoE <- ifelse(is.na(customer_master_data$Avgas.CC.Utilization.in.last.12.months),
                                   IV$Tables$Avgas.CC.Utilization.in.last.12.months$WOE[1],
                                   ifelse(customer_master_data$Avgas.CC.Utilization.in.last.12.months<=4,
                                          IV$Tables$Avgas.CC.Utilization.in.last.12.months$WOE[2],
                                          ifelse(between(customer_master_data$Avgas.CC.Utilization.in.last.12.months,5,6),
          IV$Tables$Avgas.CC.Utilization.in.last.12.months$WOE[3],
          ifelse(between(customer_master_data$Avgas.CC.Utilization.in.last.12.months,7,8),
                 IV$Tables$Avgas.CC.Utilization.in.last.12.months$WOE[4],
                 ifelse(between(customer_master_data$Avgas.CC.Utilization.in.last.12.months,9,11),
                        IV$Tables$Avgas.CC.Utilization.in.last.12.months$WOE[5],
                        ifelse(between(customer_master_data$Avgas.CC.Utilization.in.last.12.months,12,14),
                               IV$Tables$Avgas.CC.Utilization.in.last.12.months$WOE[6],
                               ifelse(between(customer_master_data$Avgas.CC.Utilization.in.last.12.months,15,21),
                                      IV$Tables$Avgas.CC.Utilization.in.last.12.months$WOE[7],
                                      ifelse(between(customer_master_data$Avgas.CC.Utilization.in.last.12.months,22,37),
                                             IV$Tables$Avgas.CC.Utilization.in.last.12.months$WOE[8],
                                             ifelse(between(customer_master_data$Avgas.CC.Utilization.in.last.12.months,38,51),
             IV$Tables$Avgas.CC.Utilization.in.last.12.months$WOE[9],
             ifelse(between(customer_master_data$Avgas.CC.Utilization.in.last.12.months,52,71),
                    IV$Tables$Avgas.CC.Utilization.in.last.12.months$WOE[10],
                    IV$Tables$Avgas.CC.Utilization.in.last.12.months$WOE[11]))))))))))
unique(customer_master_data$Avgas.CC.Utilization.in.last.12.months_WoE)
##  [1] -0.38102610  0.67681490 -0.47533086  0.07916613  0.80182190
##  [6]  0.80062757 -0.58461156  0.46706002 -0.56326710 -0.11599767
## [11]  0.79320847
# Calculate for rejected population
rejected_records$Avgas.CC.Utilization.in.last.12.months_WoE <- ifelse(is.na(rejected_records$Avgas.CC.Utilization.in.last.12.months),
                                   IV$Tables$Avgas.CC.Utilization.in.last.12.months$WOE[1],
                                   ifelse(rejected_records$Avgas.CC.Utilization.in.last.12.months<=4,
                                          IV$Tables$Avgas.CC.Utilization.in.last.12.months$WOE[2],
                                          ifelse(between(rejected_records$Avgas.CC.Utilization.in.last.12.months,5,6),
          IV$Tables$Avgas.CC.Utilization.in.last.12.months$WOE[3],
          ifelse(between(rejected_records$Avgas.CC.Utilization.in.last.12.months,7,8),
                 IV$Tables$Avgas.CC.Utilization.in.last.12.months$WOE[4],
                 ifelse(between(rejected_records$Avgas.CC.Utilization.in.last.12.months,9,11),
                        IV$Tables$Avgas.CC.Utilization.in.last.12.months$WOE[5],
                        ifelse(between(rejected_records$Avgas.CC.Utilization.in.last.12.months,12,14),
                               IV$Tables$Avgas.CC.Utilization.in.last.12.months$WOE[6],
                               ifelse(between(rejected_records$Avgas.CC.Utilization.in.last.12.months,15,21),
                                      IV$Tables$Avgas.CC.Utilization.in.last.12.months$WOE[7],
                                      ifelse(between(rejected_records$Avgas.CC.Utilization.in.last.12.months,22,37),
                                             IV$Tables$Avgas.CC.Utilization.in.last.12.months$WOE[8],
                                             ifelse(between(rejected_records$Avgas.CC.Utilization.in.last.12.months,38,51),
             IV$Tables$Avgas.CC.Utilization.in.last.12.months$WOE[9],
             ifelse(between(rejected_records$Avgas.CC.Utilization.in.last.12.months,52,71),
                    IV$Tables$Avgas.CC.Utilization.in.last.12.months$WOE[10],
                    IV$Tables$Avgas.CC.Utilization.in.last.12.months$WOE[11]))))))))))

unique(rejected_records$Avgas.CC.Utilization.in.last.12.months_WoE)
##  [1] -0.56326710 -0.38102610 -0.58461156 -0.47533086  0.07916613
##  [6]  0.46706002 -0.11599767  0.67681490  0.80062757  0.79320847
## [11]  0.80182190
# Replace 'NA' (WoE = -0.1159976) values with median of [72,113] (nearest WoE = -0.38102610)
customer_master_data$Avgas.CC.Utilization.in.last.12.months_imputed <- ifelse(is.na(customer_master_data$Avgas.CC.Utilization.in.last.12.months),
                                       median(filter(customer_master_data, Avgas.CC.Utilization.in.last.12.months >=72 & Avgas.CC.Utilization.in.last.12.months<=113) [, "Avgas.CC.Utilization.in.last.12.months"]),
                                       customer_master_data$Avgas.CC.Utilization.in.last.12.months)


# NA values exist for Avgas.CC.Utilization.in.last.12.months in rejected population
sum(is.na(rejected_records$Avgas.CC.Utilization.in.last.12.months))
## [1] 35
# [1] 35
rejected_records$Avgas.CC.Utilization.in.last.12.months_imputed <- ifelse(is.na(rejected_records$Avgas.CC.Utilization.in.last.12.months),
                                       median(filter(rejected_records, Avgas.CC.Utilization.in.last.12.months >=72 & Avgas.CC.Utilization.in.last.12.months<=113) [, "Avgas.CC.Utilization.in.last.12.months"]),
                                   rejected_records$Avgas.CC.Utilization.in.last.12.months)

sum(is.na(rejected_records$Avgas.CC.Utilization.in.last.12.months_imputed))
## [1] 0
# [1] 0

summary(customer_master_data)
##  Application.ID           Age           Gender         
##  Min.   :1.004e+05   Min.   :18.00   Length:69802      
##  1st Qu.:2.484e+08   1st Qu.:38.00   Class :character  
##  Median :4.979e+08   Median :45.00   Mode  :character  
##  Mean   :4.992e+08   Mean   :45.03                     
##  3rd Qu.:7.498e+08   3rd Qu.:53.00                     
##  Max.   :1.000e+09   Max.   :65.00                     
##                                                        
##  Marital.Status..at.the.time.of.application. No.of.dependents
##  Length:69802                                Min.   :0.00    
##  Class :character                            1st Qu.:2.00    
##  Mode  :character                            Median :3.00    
##                                              Mean   :2.86    
##                                              3rd Qu.:4.00    
##                                              Max.   :5.00    
##                                                              
##      Income       Education          Profession        Type.of.residence 
##  Min.   :-0.50   Length:69802       Length:69802       Length:69802      
##  1st Qu.:14.00   Class :character   Class :character   Class :character  
##  Median :27.00   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :27.41                                                           
##  3rd Qu.:40.00                                                           
##  Max.   :60.00                                                           
##                                                                          
##  No.of.months.in.current.residence No.of.months.in.current.company
##  Min.   :  6.00                    Min.   :  3.00                 
##  1st Qu.:  6.00                    1st Qu.: 17.00                 
##  Median : 10.00                    Median : 34.00                 
##  Mean   : 34.57                    Mean   : 34.19                 
##  3rd Qu.: 61.00                    3rd Qu.: 51.00                 
##  Max.   :126.00                    Max.   :133.00                 
##                                                                   
##  Performance No.of.times.90.DPD.or.worse.in.last.6.months
##  0:66856     Min.   :0.0000                              
##  1: 2946     1st Qu.:0.0000                              
##              Median :0.0000                              
##              Mean   :0.2491                              
##              3rd Qu.:0.0000                              
##              Max.   :3.0000                              
##                                                          
##  No.of.times.60.DPD.or.worse.in.last.6.months
##  Min.   :0.0000                              
##  1st Qu.:0.0000                              
##  Median :0.0000                              
##  Mean   :0.3918                              
##  3rd Qu.:1.0000                              
##  Max.   :5.0000                              
##                                              
##  No.of.times.30.DPD.or.worse.in.last.6.months
##  Min.   :0.0000                              
##  1st Qu.:0.0000                              
##  Median :0.0000                              
##  Mean   :0.5236                              
##  3rd Qu.:1.0000                              
##  Max.   :7.0000                              
##                                              
##  No.of.times.90.DPD.or.worse.in.last.12.months
##  Min.   :0.000                                
##  1st Qu.:0.000                                
##  Median :0.000                                
##  Mean   :0.415                                
##  3rd Qu.:1.000                                
##  Max.   :5.000                                
##                                               
##  No.of.times.60.DPD.or.worse.in.last.12.months
##  Min.   :0.0000                               
##  1st Qu.:0.0000                               
##  Median :0.0000                               
##  Mean   :0.6035                               
##  3rd Qu.:1.0000                               
##  Max.   :7.0000                               
##                                               
##  No.of.times.30.DPD.or.worse.in.last.12.months
##  Min.   :0.000                                
##  1st Qu.:0.000                                
##  Median :0.000                                
##  Mean   :0.734                                
##  3rd Qu.:1.000                                
##  Max.   :9.000                                
##                                               
##  Avgas.CC.Utilization.in.last.12.months
##  Min.   :  0.00                        
##  1st Qu.:  8.00                        
##  Median : 15.00                        
##  Mean   : 29.27                        
##  3rd Qu.: 45.00                        
##  Max.   :113.00                        
##  NA's   :1018                          
##  No.of.trades.opened.in.last.6.months
##  Min.   : 0.000                      
##  1st Qu.: 1.000                      
##  Median : 2.000                      
##  Mean   : 2.286                      
##  3rd Qu.: 3.000                      
##  Max.   :12.000                      
##                                      
##  No.of.trades.opened.in.last.12.months
##  Min.   : 0.000                       
##  1st Qu.: 2.000                       
##  Median : 4.000                       
##  Mean   : 5.788                       
##  3rd Qu.: 9.000                       
##  Max.   :28.000                       
##                                       
##  No.of.PL.trades.opened.in.last.6.months
##  Min.   :0.00                           
##  1st Qu.:0.00                           
##  Median :1.00                           
##  Mean   :1.19                           
##  3rd Qu.:2.00                           
##  Max.   :6.00                           
##                                         
##  No.of.PL.trades.opened.in.last.12.months
##  Min.   : 0.000                          
##  1st Qu.: 0.000                          
##  Median : 2.000                          
##  Mean   : 2.365                          
##  3rd Qu.: 4.000                          
##  Max.   :12.000                          
##                                          
##  No.of.Inquiries.in.last.6.months..excluding.home...auto.loans.
##  Min.   : 0.000                                                
##  1st Qu.: 0.000                                                
##  Median : 1.000                                                
##  Mean   : 1.759                                                
##  3rd Qu.: 3.000                                                
##  Max.   :10.000                                                
##                                                                
##  No.of.Inquiries.in.last.12.months..excluding.home...auto.loans.
##  Min.   : 0.000                                                 
##  1st Qu.: 0.000                                                 
##  Median : 3.000                                                 
##  Mean   : 3.527                                                 
##  3rd Qu.: 5.000                                                 
##  Max.   :20.000                                                 
##                                                                 
##  Presence.of.open.home.loan Outstanding.Balance Total.No.of.Trades
##  Min.   :0.0000             Min.   :      0     Min.   : 0.000    
##  1st Qu.:0.0000             1st Qu.: 208494     1st Qu.: 3.000    
##  Median :0.0000             Median : 774191     Median : 6.000    
##  Mean   :0.2595             Mean   :1253107     Mean   : 8.178    
##  3rd Qu.:1.0000             3rd Qu.:2925974     3rd Qu.:10.000    
##  Max.   :1.0000             Max.   :5218801     Max.   :44.000    
##  NA's   :272                NA's   :272                           
##  Presence.of.open.auto.loan Income_imputed  Income_bin     age_bin     
##  Min.   :0.00000            Min.   : 1.00   <0   :  124   <35  :11361  
##  1st Qu.:0.00000            1st Qu.:14.00   1-10 :14031   35-45:24740  
##  Median :0.00000            Median :27.00   11-20:13392   46-55:20446  
##  Mean   :0.08487            Mean   :27.45   21-30:13628   >55  :13255  
##  3rd Qu.:0.00000            3rd Qu.:40.00   31-40:12179                
##  Max.   :1.00000            Max.   :60.00   >40  :16448                
##                                                                        
##  current_residence_bin current_company_bin Performance.Tag_forIV
##  <12    :36224         (1,12] :13480       Min.   :0.0000       
##  13-24  : 4368         (36,48]:12406       1st Qu.:1.0000       
##  25-36  : 4157         (24,36]:12102       Median :1.0000       
##  37-48  : 4060         (48,60]:12084       Mean   :0.9578       
##  49-60  : 3801         (12,24]:11864       3rd Qu.:1.0000       
##  61-72  : 3463         (60,72]: 6544       Max.   :1.0000       
##  (Other):13729         (Other): 1322                            
##  Education_WoE        Education_imputed  Income_imputed_WoE
##  Min.   :-0.4920041   Length:69802       Min.   :-0.31412  
##  1st Qu.:-0.0080432   Class :character   1st Qu.:-0.08142  
##  Median :-0.0080432   Mode  :character   Median :-0.02552  
##  Mean   : 0.0003301                      Mean   : 0.02026  
##  3rd Qu.: 0.0176500                      3rd Qu.: 0.17695  
##  Max.   : 0.0282585                      Max.   : 0.82915  
##                                                            
##  Presence.of.open.home.loan_WoE Presence.of.open.home.loan_imputed
##  Min.   :-0.073840              Min.   :0.0000                    
##  1st Qu.:-0.073840              1st Qu.:0.0000                    
##  Median :-0.073840              Median :0.0000                    
##  Mean   : 0.008352              Mean   :0.2624                    
##  3rd Qu.: 0.237377              3rd Qu.:1.0000                    
##  Max.   : 0.374415              Max.   :1.0000                    
##                                                                   
##  Outstanding.Balance_WoE Outstanding.Balance_imputed
##  Min.   :-0.4513         Min.   :      0            
##  1st Qu.:-0.4028         1st Qu.: 209114            
##  Median : 0.1341         Median : 775596            
##  Mean   : 0.1204         Mean   :1259625            
##  3rd Qu.: 0.7700         3rd Qu.:2925865            
##  Max.   : 0.9199         Max.   :5218801            
##                                                     
##  Avgas.CC.Utilization.in.last.12.months_WoE
##  Min.   :-0.58461                          
##  1st Qu.:-0.47533                          
##  Median : 0.07917                          
##  Mean   : 0.14708                          
##  3rd Qu.: 0.79321                          
##  Max.   : 0.80182                          
##                                            
##  Avgas.CC.Utilization.in.last.12.months_imputed
##  Min.   :  0.00                                
##  1st Qu.:  8.00                                
##  Median : 15.00                                
##  Mean   : 30.35                                
##  3rd Qu.: 46.00                                
##  Max.   :113.00                                
## 
summary(rejected_records)
##  Application.ID           Age           Gender         
##  Min.   :   207075   Min.   :22.00   Length:1425       
##  1st Qu.:232979991   1st Qu.:34.00   Class :character  
##  Median :476559413   Median :41.00   Mode  :character  
##  Mean   :485902049   Mean   :42.37                     
##  3rd Qu.:733733436   3rd Qu.:50.00                     
##  Max.   :997504566   Max.   :65.00                     
##                                                        
##  Marital.Status..at.the.time.of.application. No.of.dependents
##  Length:1425                                 Min.   :1.000   
##  Class :character                            1st Qu.:3.000   
##  Mode  :character                            Median :3.000   
##                                              Mean   :3.141   
##                                              3rd Qu.:4.000   
##                                              Max.   :5.000   
##                                                              
##      Income       Education          Profession        Type.of.residence 
##  Min.   : 4.50   Length:1425        Length:1425        Length:1425       
##  1st Qu.: 5.00   Class :character   Class :character   Class :character  
##  Median :11.00   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :16.74                                                           
##  3rd Qu.:24.00                                                           
##  Max.   :60.00                                                           
##                                                                          
##  No.of.months.in.current.residence No.of.months.in.current.company
##  Min.   :  6.00                    Min.   : 3.00                  
##  1st Qu.:  6.00                    1st Qu.: 6.00                  
##  Median : 19.00                    Median :15.00                  
##  Mean   : 32.37                    Mean   :22.14                  
##  3rd Qu.: 47.00                    3rd Qu.:34.00                  
##  Max.   :126.00                    Max.   :75.00                  
##                                                                   
##  Performance No.of.times.90.DPD.or.worse.in.last.6.months
##  0   :   0   Min.   :0.000                               
##  1   :   0   1st Qu.:1.000                               
##  NA's:1425   Median :1.000                               
##              Mean   :1.316                               
##              3rd Qu.:2.000                               
##              Max.   :3.000                               
##                                                          
##  No.of.times.60.DPD.or.worse.in.last.6.months
##  Min.   :0.000                               
##  1st Qu.:2.000                               
##  Median :2.000                               
##  Mean   :2.335                               
##  3rd Qu.:3.000                               
##  Max.   :5.000                               
##                                              
##  No.of.times.30.DPD.or.worse.in.last.6.months
##  Min.   :0.00                                
##  1st Qu.:2.00                                
##  Median :3.00                                
##  Mean   :3.21                                
##  3rd Qu.:4.00                                
##  Max.   :7.00                                
##                                              
##  No.of.times.90.DPD.or.worse.in.last.12.months
##  Min.   :0.000                                
##  1st Qu.:1.000                                
##  Median :2.000                                
##  Mean   :2.191                                
##  3rd Qu.:3.000                                
##  Max.   :5.000                                
##                                               
##  No.of.times.60.DPD.or.worse.in.last.12.months
##  Min.   :0.000                                
##  1st Qu.:2.000                                
##  Median :3.000                                
##  Mean   :3.209                                
##  3rd Qu.:4.000                                
##  Max.   :7.000                                
##                                               
##  No.of.times.30.DPD.or.worse.in.last.12.months
##  Min.   :0.000                                
##  1st Qu.:3.000                                
##  Median :4.000                                
##  Mean   :4.086                                
##  3rd Qu.:5.000                                
##  Max.   :9.000                                
##                                               
##  Avgas.CC.Utilization.in.last.12.months
##  Min.   :  1.00                        
##  1st Qu.: 35.00                        
##  Median : 51.00                        
##  Mean   : 51.08                        
##  3rd Qu.: 67.00                        
##  Max.   :101.00                        
##  NA's   :35                            
##  No.of.trades.opened.in.last.6.months
##  Min.   :0.000                       
##  1st Qu.:2.000                       
##  Median :3.000                       
##  Mean   :2.933                       
##  3rd Qu.:4.000                       
##  Max.   :6.000                       
##                                      
##  No.of.trades.opened.in.last.12.months
##  Min.   : 0.000                       
##  1st Qu.: 6.000                       
##  Median : 8.000                       
##  Mean   : 7.884                       
##  3rd Qu.:10.000                       
##  Max.   :14.000                       
##                                       
##  No.of.PL.trades.opened.in.last.6.months
##  Min.   :0.000                          
##  1st Qu.:1.000                          
##  Median :2.000                          
##  Mean   :2.057                          
##  3rd Qu.:3.000                          
##  Max.   :4.000                          
##                                         
##  No.of.PL.trades.opened.in.last.12.months
##  Min.   :0.000                           
##  1st Qu.:3.000                           
##  Median :4.000                           
##  Mean   :4.064                           
##  3rd Qu.:5.000                           
##  Max.   :8.000                           
##                                          
##  No.of.Inquiries.in.last.6.months..excluding.home...auto.loans.
##  Min.   :0.000                                                 
##  1st Qu.:1.000                                                 
##  Median :2.000                                                 
##  Mean   :2.036                                                 
##  3rd Qu.:3.000                                                 
##  Max.   :4.000                                                 
##                                                                
##  No.of.Inquiries.in.last.12.months..excluding.home...auto.loans.
##  Min.   :0.000                                                  
##  1st Qu.:3.000                                                  
##  Median :4.000                                                  
##  Mean   :4.036                                                  
##  3rd Qu.:5.000                                                  
##  Max.   :8.000                                                  
##                                                                 
##  Presence.of.open.home.loan Outstanding.Balance Total.No.of.Trades
##  Min.   :0.00000            Min.   :  17155     Min.   : 1.000    
##  1st Qu.:0.00000            1st Qu.: 610175     1st Qu.: 7.000    
##  Median :0.00000            Median : 804672     Median : 9.000    
##  Mean   :0.09895            Mean   :1045252     Mean   : 8.772    
##  3rd Qu.:0.00000            3rd Qu.:1074624     3rd Qu.:11.000    
##  Max.   :1.00000            Max.   :4143938     Max.   :16.000    
##                                                                   
##  Presence.of.open.auto.loan Income_imputed  Income_imputed_WoE
##  Min.   :0.00000            Min.   : 4.50   Min.   :-0.31412  
##  1st Qu.:0.00000            1st Qu.: 5.00   1st Qu.:-0.31412  
##  Median :0.00000            Median :11.00   Median :-0.08142  
##  Mean   :0.07228            Mean   :16.74   Mean   :-0.12574  
##  3rd Qu.:0.00000            3rd Qu.:24.00   3rd Qu.:-0.06640  
##  Max.   :1.00000            Max.   :60.00   Max.   : 0.36098  
##                                                               
##  Outstanding.Balance_WoE Outstanding.Balance_imputed
##  Min.   :-0.4513         Min.   :  17155            
##  1st Qu.:-0.4370         1st Qu.: 610175            
##  Median :-0.4028         Median : 804672            
##  Mean   :-0.3014         Mean   :1045252            
##  3rd Qu.:-0.2547         3rd Qu.:1074624            
##  Max.   : 0.9199         Max.   :4143938            
##                                                     
##  Avgas.CC.Utilization.in.last.12.months_WoE
##  Min.   :-0.5846                           
##  1st Qu.:-0.5633                           
##  Median :-0.5633                           
##  Mean   :-0.4205                           
##  3rd Qu.:-0.3810                           
##  Max.   : 0.8018                           
##                                            
##  Avgas.CC.Utilization.in.last.12.months_imputed
##  Min.   :  1.00                                
##  1st Qu.: 36.00                                
##  Median : 52.00                                
##  Mean   : 51.81                                
##  3rd Qu.: 68.00                                
##  Max.   :101.00                                
## 

Feature Engineering - Encoding/Dummy Variables

# Using No.of.dependents as numerical only
# customer_master_data$No.of.dependents <- as.factor(customer_master_data$No.of.dependents)
# customer_master_data$No.of.dependents <- as.numeric(customer_master_data$No.of.dependents)

# Gender
customer_master_data$Gender <- as.factor(customer_master_data$Gender)
levels(customer_master_data$Gender) <- c(1,0)

# Marital Status
customer_master_data$Marital.Status..at.the.time.of.application. <- as.factor(customer_master_data$Marital.Status..at.the.time.of.application.)
levels(customer_master_data$Marital.Status..at.the.time.of.application.) <- c(1,0)

# Type of Residence
customer_master_data$Type.of.residence <- as.factor(customer_master_data$Type.of.residence)

# One-Hot encoding for Education_imputed
customer_master_data$Education_imputed <- as.factor(customer_master_data$Education_imputed)
dummy_education <- data.frame(model.matrix(~Education_imputed,
    data=customer_master_data))
dummy_education <- dummy_education[,-1]
customer_master_data <- cbind(customer_master_data, dummy_education)

# One-Hot encoding for Profession
customer_master_data$Profession <- as.factor(customer_master_data$Profession)
dummy_profession <- data.frame(model.matrix(~Profession,
     data=customer_master_data))
dummy_profession <- dummy_profession[,-1]
customer_master_data <- cbind(customer_master_data, dummy_profession)

# One-Hot encoding for Residence Type
customer_master_data$Type.of.residence <- as.factor(customer_master_data$Type.of.residence)
dummy_residencetype <- data.frame(model.matrix(~Type.of.residence,
        data=customer_master_data))
dummy_residencetype <- dummy_residencetype[,-1]
customer_master_data <- cbind(customer_master_data, dummy_residencetype)

# Creating a .CSV file with WoE values
write.csv(customer_master_data,"customer_master_data_cleaned_WoE_feature_engineering.csv")

Exploratory Data Analysis (EDA)

graph_data <- customer_master_data

# Columns considered to convert into factor
toFactor_colname <- c("Gender","Marital.Status..at.the.time.of.application.",
                      "No.of.dependents","Education","Profession",
                      "Income_bin","age_bin","current_residence_bin",
                      "current_company_bin","Type.of.residence",
                      "Presence.of.open.auto.loan","Presence.of.open.home.loan",
                      "No.of.times.30.DPD.or.worse.in.last.12.months",
                      "No.of.times.30.DPD.or.worse.in.last.6.months",
                      "No.of.times.60.DPD.or.worse.in.last.12.months",
                      "No.of.times.60.DPD.or.worse.in.last.6.months",
                      "No.of.times.90.DPD.or.worse.in.last.12.months",
                      "No.of.times.90.DPD.or.worse.in.last.6.months")


graph_data[toFactor_colname] <- lapply(graph_data[toFactor_colname],factor)

graph_data$Performance <- as.factor(ifelse(graph_data$Performance==0,
                                           "Non-Defaulters",
                                           "Defaulters"))

ggplot(graph_data,aes(x=Performance,fill=Performance)) +
  geom_bar() +  
  geom_text(stat = "count", aes(y = ((..count..)/sum(..count..)), 
                                label = scales::percent((..count..)/sum(..count..))),
            vjust =-0.25)

# Only 4.2% defaulters and this is an unbalanced data

str(graph_data)
## 'data.frame':    69802 obs. of  54 variables:
##  $ Application.ID                                                 : int  100450 128993 142768 176721 197956 203973 210394 223052 237197 247959 ...
##  $ Age                                                            : int  52 36 55 55 28 43 42 51 44 59 ...
##  $ Gender                                                         : Factor w/ 2 levels "1","0": 2 2 2 2 1 1 2 2 2 2 ...
##  $ Marital.Status..at.the.time.of.application.                    : Factor w/ 2 levels "1","0": 1 1 1 1 1 2 1 1 1 1 ...
##  $ No.of.dependents                                               : Factor w/ 6 levels "0","1","2","3",..: 5 5 2 4 4 2 3 4 5 6 ...
##  $ Income                                                         : num  32 13 29 53 35 35 43 4.5 5 40 ...
##  $ Education                                                      : Factor w/ 6 levels "","Bachelor",..: 6 6 5 6 3 6 2 3 6 3 ...
##  $ Profession                                                     : Factor w/ 3 levels "SAL","SE","SE_PROF": 3 1 1 3 1 1 3 1 1 1 ...
##  $ Type.of.residence                                              : Factor w/ 5 levels "Company provided",..: 5 5 5 5 5 5 5 5 5 4 ...
##  $ No.of.months.in.current.residence                              : int  79 6 46 6 6 6 6 83 6 6 ...
##  $ No.of.months.in.current.company                                : int  3 21 3 27 43 52 3 48 38 5 ...
##  $ Performance                                                    : Factor w/ 2 levels "Defaulters","Non-Defaulters": 2 1 2 2 2 2 2 2 2 2 ...
##  $ No.of.times.90.DPD.or.worse.in.last.6.months                   : Factor w/ 4 levels "0","1","2","3": 1 2 1 1 1 1 1 2 1 1 ...
##  $ No.of.times.60.DPD.or.worse.in.last.6.months                   : Factor w/ 6 levels "0","1","2","3",..: 1 2 2 1 1 1 1 2 1 1 ...
##  $ No.of.times.30.DPD.or.worse.in.last.6.months                   : Factor w/ 8 levels "0","1","2","3",..: 1 3 2 1 1 1 1 2 1 1 ...
##  $ No.of.times.90.DPD.or.worse.in.last.12.months                  : Factor w/ 6 levels "0","1","2","3",..: 1 2 2 1 1 1 1 2 1 1 ...
##  $ No.of.times.60.DPD.or.worse.in.last.12.months                  : Factor w/ 8 levels "0","1","2","3",..: 1 2 2 1 1 1 1 2 1 1 ...
##  $ No.of.times.30.DPD.or.worse.in.last.12.months                  : Factor w/ 10 levels "0","1","2","3",..: 1 3 2 1 1 1 1 2 1 1 ...
##  $ Avgas.CC.Utilization.in.last.12.months                         : int  113 9 34 18 3 6 0 48 13 16 ...
##  $ No.of.trades.opened.in.last.6.months                           : num  2 1 2 0 7 1 1 2 1 2 ...
##  $ No.of.trades.opened.in.last.12.months                          : int  8 6 7 1 13 2 2 5 3 4 ...
##  $ No.of.PL.trades.opened.in.last.6.months                        : int  2 1 2 0 5 0 0 2 0 1 ...
##  $ No.of.PL.trades.opened.in.last.12.months                       : int  5 4 4 1 5 0 1 3 1 1 ...
##  $ No.of.Inquiries.in.last.6.months..excluding.home...auto.loans. : int  1 4 1 2 6 0 1 2 2 1 ...
##  $ No.of.Inquiries.in.last.12.months..excluding.home...auto.loans.: int  3 7 2 3 11 0 3 5 4 1 ...
##  $ Presence.of.open.home.loan                                     : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 1 1 1 1 ...
##  $ Outstanding.Balance                                            : int  3903438 741058 815325 209593 992024 556 202816 575772 204444 199818 ...
##  $ Total.No.of.Trades                                             : int  9 8 9 3 25 5 4 6 5 7 ...
##  $ Presence.of.open.auto.loan                                     : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Income_imputed                                                 : num  32 13 29 53 35 35 43 4.5 5 40 ...
##  $ Income_bin                                                     : Ord.factor w/ 6 levels "<0"<"1-10"<"11-20"<..: 5 3 4 6 5 5 6 2 2 5 ...
##  $ age_bin                                                        : Ord.factor w/ 4 levels "<35"<"35-45"<..: 3 2 4 4 1 2 2 3 2 4 ...
##  $ current_residence_bin                                          : Ord.factor w/ 11 levels "<12"<"13-24"<..: 7 1 4 1 1 1 1 7 1 1 ...
##  $ current_company_bin                                            : Ord.factor w/ 10 levels "(1,12]"<"(12,24]"<..: 1 2 1 3 4 5 1 4 4 1 ...
##  $ Performance.Tag_forIV                                          : num  1 0 1 1 1 1 1 1 1 1 ...
##  $ Education_WoE                                                  : num  0.01765 0.01765 0.02826 0.01765 -0.00804 ...
##  $ Education_imputed                                              : Factor w/ 5 levels "Bachelor","Masters",..: 5 5 4 5 2 5 1 2 5 2 ...
##  $ Income_imputed_WoE                                             : num  0.1551 -0.0664 -0.0796 0.361 0.1551 ...
##  $ Presence.of.open.home.loan_WoE                                 : num  0.2374 -0.0738 -0.0738 -0.0738 -0.0738 ...
##  $ Presence.of.open.home.loan_imputed                             : num  1 0 0 0 0 0 0 0 0 0 ...
##  $ Outstanding.Balance_WoE                                        : num  -0.296 -0.451 -0.437 0.134 -0.403 ...
##  $ Outstanding.Balance_imputed                                    : int  3903438 741058 815325 209593 992024 556 202816 575772 204444 199818 ...
##  $ Avgas.CC.Utilization.in.last.12.months_WoE                     : num  -0.381 0.6768 -0.4753 0.0792 0.8018 ...
##  $ Avgas.CC.Utilization.in.last.12.months_imputed                 : int  113 9 34 18 3 6 0 48 13 16 ...
##  $ Education_imputedMasters                                       : num  0 0 0 0 1 0 0 1 0 1 ...
##  $ Education_imputedOthers                                        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Education_imputedPhd                                           : num  0 0 1 0 0 0 0 0 0 0 ...
##  $ Education_imputedProfessional                                  : num  1 1 0 1 0 1 0 0 1 0 ...
##  $ ProfessionSE                                                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ProfessionSE_PROF                                              : num  1 0 0 1 0 0 1 0 0 0 ...
##  $ Type.of.residenceLiving.with.Parents                           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Type.of.residenceOthers                                        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Type.of.residenceOwned                                         : num  0 0 0 0 0 0 0 0 0 1 ...
##  $ Type.of.residenceRented                                        : num  1 1 1 1 1 1 1 1 1 0 ...
graph_data_categorical <- graph_data[,sapply(graph_data,is.factor)]
graph_data_continuous <- graph_data[,!sapply(graph_data,is.factor)]
graph_data_continuous <- graph_data_continuous[,-c(1,2)]

graph_data_categorical <- graph_data_categorical %>% dplyr::select(-Performance,Performance)

View(graph_data_categorical)
View(graph_data_continuous)

Plots for Independent factor variables

gather(graph_data_categorical, x, y, Gender:current_company_bin) %>%
  ggplot(aes(x = y, color = Performance, fill = Performance)) +
  geom_density(alpha = 0.3) +
  facet_wrap( ~ x, scales = "free", ncol = 3)

Independent continuous variable graph

# Histograms
# Excluding variables of Type - Original with NA/Missing Values, Dummy Variables & WoE Values
graph_data_continuous [, -c(1, 4, 11, 14, 15, 16, 
                            17, 19, 21, 23, 24, 25, 
                            26, 27, 28, 29, 30, 31, 32) ] %>% 
  gather() %>%
  ggplot(aes(value)) +
  facet_wrap( ~ key, scales = "free") + geom_histogram()

## boxplots
graph_data_continuous_1 <- graph_data_continuous %>% dplyr::select(-Outstanding.Balance,Outstanding.Balance)
new_col_names <- c("Age","Income","No.curr.resi","No.curr.comp",
                   "AvgCC.Util.12","trades_6","trades_12",
                   "PL_6","PL_12","inq_6_auto","In_12_auto",
                   "total_trade","Perf","Outstanding.balance")

colnames(graph_data_continuous_1) <- new_col_names

temp <- melt(graph_data_continuous_1[,1:13],id.vars = "Perf")
ggplot(data = temp, aes(x=variable, y=value)) + geom_boxplot(aes(fill=Perf))

boxplot(graph_data_continuous_1$Outstanding.balance)

Multivariate analysis

graph_data_multivariate <- dplyr::filter(graph_data,graph_data$Performance=="Defaulters")

ggplot(data = graph_data_multivariate, aes(x = age_bin,y=Performance, fill = Income_bin)) +
  geom_bar(aes(y = prop.table(..count..) * 100),
           position = "dodge") + 
  geom_text(aes(y = round(prop.table(..count..) * 100 + 0.5,2), 
                label = paste0(round(prop.table(..count..) * 100,2), '%')), 
            stat = 'count', 
            position = position_dodge(.9), 
            size = 3) + 
  labs(x = 'Age group', y = 'Defaulters', fill = 'Income Group')

ggplot(data = graph_data_multivariate, aes(x = age_bin,y=Performance, fill = Gender)) +
  geom_bar(aes(y = prop.table(..count..) * 100),
           position = "dodge") + 
  geom_text(aes(y = round(prop.table(..count..) * 100 + 0.5,2), 
                label = paste0(round(prop.table(..count..) * 100,2), '%')), 
            stat = 'count', 
            position = position_dodge(.9), 
            size = 3) + 
  labs(x = 'Age group', y = 'Defaulters', fill = 'Gender')

ggplot(data = graph_data_multivariate, aes(x = Income_bin,y=Performance, fill = Gender)) +
  geom_bar(aes(y = prop.table(..count..) * 100),
           position = "dodge") + 
  geom_text(aes(y = round(prop.table(..count..) * 100 + 0.5,2), 
                label = paste0(round(prop.table(..count..) * 100,2), '%')), 
            stat = 'count', 
            position = position_dodge(.9), 
            size = 3) + 
  labs(x = 'Income group', y = 'Defaulters', fill = 'Gender')

ggplot(data = graph_data_multivariate, aes(x = No.of.dependents,y=Performance, fill = Income_bin)) +
  geom_bar(aes(y = prop.table(..count..) * 100),
           position = "dodge") + 
  geom_text(aes(y = round(prop.table(..count..) * 100 + 0.5,2), 
                label = paste0(round(prop.table(..count..) * 100,2), '%')), 
            stat = 'count', 
            position = position_dodge(.9), 
            size = 3) + 
  labs(x = 'Number of Dependents', y = 'Defaulters', fill = 'Income group')

ggplot(data = graph_data_multivariate, aes(x = Type.of.residence,y=Performance, fill = Income_bin)) +
  geom_bar(aes(y = prop.table(..count..) * 100),
           position = "dodge") + 
  geom_text(aes(y = round(prop.table(..count..) * 100 + 0.5,2), 
                label = paste0(round(prop.table(..count..) * 100,2), '%')), 
            stat = 'count', 
            position = position_dodge(.9), 
            size = 3) + 
  labs(x = 'Type of Residence', y = 'Defaulters', fill = 'Income group')

EDA Inferences

Top 7 Importat variables are highlighted

Variable Importance Conclusion

Age - High - Age group of 35-55 is significant
Gender - Low - Not significant feature
Marital Status - Medium - EDA also confirms Married Significant
No of dependents - Low - Not significant feature
Income - Low - Not significant feature
Education - Low - Not significant feature
Profession - High - Salaried is significant with high frequency
Type of residence - High - Rented is the most significant with high
No of months in current residence - High - < 12 months is high frequency
No of months in current company - Medium - EDA also confirms <24 Months has significant default
No of times 90 DPD or worse in last 6 months - Medium - Higher the number has default effect
No of times 60 DPD or worse in last 6 months - Medium - Higher the number has default effect
No of times 30 DPD or worse in last 6 months - Medium - Higher the number has default effect
No of times 90 DPD or worse in last 12 months - Medium - Higher the number has default effect
No of times 60 DPD or worse in last 12 months - Medium - Higher the number has default effect
No of times 30 DPD or worse in last 12 months - Low - Not significant feature
Avgas CC Utilization in last 12 months - High - Most of the utilization are <20
No of trades opened in last 6 months - Low - Not significant feature
No of trades opened in last 12 months - Low - Not significant feature
No of PL trades opened in last 6 months - Low - Not significant feature
No of PL trades opened in last 12 months - Low - Not significant feature
No of Inquiries in last 6 months (excluding home & auto loans) - Low - Not significant feature
No of Inquiries in last 12 months (excluding home & auto loans) - Low - Not significant feature
Presence of open home loan - Low - Not significant feature
Outstanding Balance - Low - Not significant feature
Total No of Trades - Low - Not significant feature
Presence of open auto loan - Low - Not significant feature

Feature Selection - Based on WoE/IV and Correlation Analysis

# Feature Selection based on WoE/IV = 0.02 to 0.5
arrange(IV_Value [IV_Value$IV >=0.02, ], desc(IV))[1]
##                                                           Variable
## 1                           Avgas.CC.Utilization.in.last.12.months
## 2                            No.of.trades.opened.in.last.12.months
## 3                         No.of.PL.trades.opened.in.last.12.months
## 4  No.of.Inquiries.in.last.12.months..excluding.home...auto.loans.
## 5                                              Outstanding.Balance
## 6                     No.of.times.30.DPD.or.worse.in.last.6.months
## 7                                               Total.No.of.Trades
## 8                          No.of.PL.trades.opened.in.last.6.months
## 9                    No.of.times.90.DPD.or.worse.in.last.12.months
## 10                    No.of.times.60.DPD.or.worse.in.last.6.months
## 11  No.of.Inquiries.in.last.6.months..excluding.home...auto.loans.
## 12                   No.of.times.30.DPD.or.worse.in.last.12.months
## 13                            No.of.trades.opened.in.last.6.months
## 14                   No.of.times.60.DPD.or.worse.in.last.12.months
## 15                    No.of.times.90.DPD.or.worse.in.last.6.months
## 16                               No.of.months.in.current.residence
## 17                                           current_residence_bin
## 18                                                  Income_imputed
## 19                                                          Income
## 20                                                      Income_bin
## 21                                 No.of.months.in.current.company

Correlation Matrix

# Correlation matrix
features_for_correlationMatrix <-
  c(
    "Income_imputed",
    "No.of.months.in.current.company",
    "No.of.months.in.current.residence",
    
    "Avgas.CC.Utilization.in.last.12.months_WoE",
    "Avgas.CC.Utilization.in.last.12.months_imputed",
    "Outstanding.Balance_WoE",
    "Outstanding.Balance_imputed",
    #"Presence.of.open.home.loan_WoE",
    #"Presence.of.open.home.loan_imputed",
    
    
    "Total.No.of.Trades",
    "No.of.trades.opened.in.last.6.months",
    "No.of.trades.opened.in.last.12.months",
    "No.of.PL.trades.opened.in.last.6.months",
    "No.of.PL.trades.opened.in.last.12.months",
    
    "No.of.Inquiries.in.last.12.months..excluding.home...auto.loans.",
    "No.of.Inquiries.in.last.6.months..excluding.home...auto.loans.",
    
    "No.of.times.30.DPD.or.worse.in.last.6.months",
    "No.of.times.60.DPD.or.worse.in.last.6.months",
    "No.of.times.90.DPD.or.worse.in.last.6.months",
    "No.of.times.30.DPD.or.worse.in.last.12.months",
    "No.of.times.60.DPD.or.worse.in.last.12.months",
    "No.of.times.90.DPD.or.worse.in.last.12.months"
  )
plot_correlationMatrix (customer_master_data, features_for_correlationMatrix)

Correlation Analysis

  1. All DPD columns are highly correlated (0.8 to 0.95), chosing one variable with high IV value range (0.02 to 0.5) i.e. “No.of.times.30.DPD.or.worse.in.last.6.months”
  2. All Trades related features are highly correlated 0.6 to 0.94, chosing only with high WoE value i.e. No.of.trades.opened.in.last.12.months"
  3. Feature “Outstanding.Balance_WoE” is correlated (0.65) with “Avgas.CC.Utilization.in.last.12.months_WoE” but considering based on business intution. Also considering “Outstanding.Balance_imputed”
  4. Discarding both “Presence.of.open.home.loan_WoE” and “Presence.of.open.home.loan_imputed” because they both are highly correlated (0.93 and 0.94 respectively) with “Outstanding.Balance_imputed”
## Final Feature selection
#  Following are list of features which are not having high correlation -0.5 to 0.5 with other featues

demographic_data_features <- c("Income_imputed",
                               "No.of.months.in.current.company",
                               "No.of.months.in.current.residence")

creditbureau_data_features <- c(    "Avgas.CC.Utilization.in.last.12.months_WoE",
                                    "Avgas.CC.Utilization.in.last.12.months_imputed",
                                    "Outstanding.Balance_WoE",
                                    "Outstanding.Balance_imputed",
                                    "No.of.times.30.DPD.or.worse.in.last.6.months",
                                    "No.of.trades.opened.in.last.12.months")

Model Building - Split Train (and Validation) & Test Datasets

# Not using scale technique as there is no gain/loss with model performance/accuracy metrics
# customer_master_data[, scale_col] <- sapply(customer_master_data[, scale_col], scale)
View(customer_master_data)

set.seed(100)
# Randomly divide the data into training and test sets (stratified by class)
index <- createDataPartition(customer_master_data$Performance, p = 0.7, list = FALSE)
train_data <- customer_master_data[index, ]
summary(train_data$Performance)
##     0     1 
## 46800  2063
2063/(46800+2063)*100
## [1] 4.222008
test_data  <- customer_master_data[-index, ]
summary(test_data$Performance)
##     0     1 
## 20056   883
883/(20056+883)*100
## [1] 4.217011
test_actual_default <- factor(ifelse(test_data$Performance ==1,"Yes","No"))

Model Evaluation - Common Functions

# Function for Choosing the optimal probalility cutoff
perform_fn <- function(cutoff, test_data_prediction) 
{
  predicted_default <- factor(ifelse(test_data_prediction >= cutoff, "Yes", "No"))
  conf <- confusionMatrix(predicted_default, test_actual_default, positive = "Yes")
  acc <- conf$overall[1]
  sens <- conf$byClass[1]
  spec <- conf$byClass[2]
  out <- t(as.matrix(c(sens, spec, acc))) 
  colnames(out) <- c("sensitivity", "specificity", "accuracy")
  return(out)
}

# Function for calculating Optimal Cutoff
findOptimalCutOff <- function (test_data_prediction, thresholdStart, thresholdEnd) {
  # Summary of test probability
  summary(test_data_prediction)
  
  #s = seq(.03,.14,length=100)
  s = seq(thresholdStart, thresholdEnd,length=100)
  OUT = matrix(0,100,3)
  
  for(i in 1:100)
  {
    OUT[i,] = perform_fn(s[i], test_data_prediction)
  } 
  
  
  plot(s, OUT[,1],xlab="Cutoff",ylab="Value",cex.lab=1.5,cex.axis=1.5,ylim=c(0,1),type="l",lwd=2,axes=FALSE,col=2)
  axis(1,seq(0,1,length=5),seq(0,1,length=5),cex.lab=1.5)
  axis(2,seq(0,1,length=5),seq(0,1,length=5),cex.lab=1.5)
  lines(s,OUT[,2],col="darkgreen",lwd=2)
  lines(s,OUT[,3],col=4,lwd=2)
  box()
  legend(0.75,0.50,col=c(2,"darkgreen",4,"darkred"),lwd=c(2,2,2,2),c("Sensitivity","Specificity","Accuracy"))
  
  cutoff <- s[which(abs(OUT[,1]-OUT[,2])<0.02)]
  
  cat("Optimal Cutoff = ", round(cutoff,3)[1])
  cat ("\n\n")
  
  return(round(cutoff,3)[1])
}

## Function for calculation of lift and cumulative gain
lift <- function(labels , predicted_prob, groups=10) {
  
  if(is.factor(labels)) labels  <- as.integer(as.character(labels ))
  if(is.factor(predicted_prob)) predicted_prob <- as.integer(as.character(predicted_prob))
  helper = data.frame(cbind(labels , predicted_prob))
  helper[,"bucket"] = ntile(-helper[,"predicted_prob"], groups)
  gaintable = helper %>% group_by(bucket)  %>%
    summarise_at(vars(labels ), funs(total = n(),
                                     totalresp=sum(., na.rm = TRUE))) %>%
    mutate(Cumresp = cumsum(totalresp),
           Gain=Cumresp/sum(totalresp)*100,
           Cumlift=Gain/(bucket*(100/groups)))
  return(gaintable)
}

# For plotting the gain chart and to compute KS Statistic
GainLiftChart_KSStatistic <- function(model,data, value) {
  
  temp <- data
  temp$Predict <- predict(model,type=value,newdata=temp)
  
  LG = lift(temp$Performance, temp$Predict, groups = 10)
  # Gain Chart 
  plot(LG$bucket,LG$Gain,col="red",type="l",main="Gain Chart",xlab="% of total targeted",ylab = "% of positive Response")
  
  # Lift Chart 
  plot(LG$bucket,LG$Cumlift,col="blue",type="l",main="Lift Chart",xlab="% of total targeted",ylab = "Lift")
  write.csv(LG,"Lift-CumulativeGain-table.csv")
  
  # KS-Statistic
  if(value=="raw"){
    pred_object_test<- prediction(as.numeric(temp$Predict), as.numeric(temp$Performance))
  }else{
    pred_object_test<- prediction(temp$Predict,temp$Performance)
  }
  
  performance_measures_test<- performance(pred_object_test, "tpr", "fpr")
  ks_table_test <- attr(performance_measures_test, "y.values")[[1]] - 
    (attr(performance_measures_test, "x.values")[[1]])
  
  #LG$KS <- ks_table_test
  print(LG)
  max(ks_table_test)
  
}

evaluateClassificationModel <- function (test_pred, test_actual_default, cutOff) {  
  
  # Get optimal cut-off 
  #cutOff <- findOptimalCutOff(test_pred, thresholdStart, thresholdEnd)
  
  test_pred_default <- factor(ifelse(test_pred >= cutOff, "Yes", "No"))
  table(test_actual_default,test_pred_default)
  
  #install.packages("e1071")
  library(e1071)
  
  test_conf <- confusionMatrix(test_pred_default, test_actual_default, positive = "Yes")
  acc <- test_conf$overall[1]
  sens <- test_conf$byClass[1]
  spec <- test_conf$byClass[2]
  
  print(test_conf)
  
  precission_recall_f <- accuracy.meas(test_actual_default, test_pred_default, cutOff)
  
  # Using only F score 
  
  # Precision can be seen as a measure of exactness or quality, whereas recall is a measure of completeness or 
  # quantity. In simple terms, high precision means that an algorithm returned substantially more relevant 
  # results than irrelevant ones, while high recall means that an algorithm returned most of the relevant results.
  
  roc_metrics <- roc.curve(test_actual_default, test_pred_default, plotit = T)
  
  metrics <- data.frame(Accuracy=acc, 
                        Sensitivity=sens, 
                        Specificity = spec,
                        F_score=precission_recall_f$F,
                        Threshold=precission_recall_f$threshold,
                        AUC=roc_metrics$auc,
                        False_positive_Rate=roc_metrics$false.positive.rate[2],
                        True_positive_Rate=roc_metrics$true.positive.rate[2])
  
  print(metrics)
  
  return(metrics)
}

Model Building (Unbalanced) - Logistic Regression with Demographic Data

# Simple Logistic Regression model Using Demographic data 
# Using WoE Variables as derived features in addition original features (imputed for mising / incorrect values)
# As WoE is created based on Target Encoding, in general is not directly correlated with base variable

# Using WoE Variable (Note: results are same with Income_imputed as well instead of Income_imputed_WoE)
# Also Income_imputed becomes significant when both are used and they both are highly correlated

logistic_model_demographic_data_unbalanced <- glm(formula = Performance ~  No.of.months.in.current.residence + 
                                                                Income_imputed_WoE +
                                                                No.of.months.in.current.company, 
                                                 family = "binomial", 
                                                 data = train_data [, -1])
summary (logistic_model_demographic_data_unbalanced)
## 
## Call:
## glm(formula = Performance ~ No.of.months.in.current.residence + 
##     Income_imputed_WoE + No.of.months.in.current.company, family = "binomial", 
##     data = train_data[, -1])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.4036  -0.3147  -0.2860  -0.2605   2.7860  
## 
## Coefficients:
##                                     Estimate Std. Error z value Pr(>|z|)
## (Intercept)                       -3.0071342  0.0496656 -60.548  < 2e-16
## No.of.months.in.current.residence  0.0018688  0.0005978   3.126  0.00177
## Income_imputed_WoE                -1.0279515  0.1093138  -9.404  < 2e-16
## No.of.months.in.current.company   -0.0054741  0.0011243  -4.869 1.12e-06
##                                      
## (Intercept)                       ***
## No.of.months.in.current.residence ** 
## Income_imputed_WoE                ***
## No.of.months.in.current.company   ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 17096  on 48862  degrees of freedom
## Residual deviance: 16971  on 48859  degrees of freedom
## AIC: 16979
## 
## Number of Fisher Scoring iterations: 6
# AIC: 16983
sort(vif(logistic_model_demographic_data_unbalanced),decreasing = TRUE)
##                Income_imputed_WoE   No.of.months.in.current.company 
##                          1.015650                          1.015548 
## No.of.months.in.current.residence 
##                          1.015372
# Income_imputed_WoE No.of.months.in.current.residence   No.of.months.in.current.company 
# 1.017824                          1.016867                          1.016003 

test_pred = predict(logistic_model_demographic_data_unbalanced, type = "response", 
                    newdata = test_data)

summary(test_pred)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.01471 0.03453 0.04085 0.04224 0.04879 0.07793
cutOff <- findOptimalCutOff(test_pred, .03,.14)

## Optimal Cutoff =  0.042
# Optimal Cutoff =  0.042

logistic_model_demographic_data_metrics <- evaluateClassificationModel(test_pred,
                            test_actual_default, 
                            cutOff)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    No   Yes
##        No  10813   394
##        Yes  9243   489
##                                          
##                Accuracy : 0.5398         
##                  95% CI : (0.533, 0.5465)
##     No Information Rate : 0.9578         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.0161         
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.55379        
##             Specificity : 0.53914        
##          Pos Pred Value : 0.05025        
##          Neg Pred Value : 0.96484        
##              Prevalence : 0.04217        
##          Detection Rate : 0.02335        
##    Detection Prevalence : 0.46478        
##       Balanced Accuracy : 0.54647        
##                                          
##        'Positive' Class : Yes            
## 

##           Accuracy Sensitivity Specificity    F_score Threshold       AUC
## Accuracy 0.5397583   0.5537939   0.5391404 0.04046375     0.042 0.5464671
##          False_positive_Rate True_positive_Rate
## Accuracy           0.4608596          0.5537939
rownames(logistic_model_demographic_data_metrics) <- "DemographicData - GLM - Unbalanced"
model_Metrics <- rbind(logistic_model_demographic_data_metrics)

# test_pred_default <- factor(ifelse(test_pred >= 0.040, "Yes", "No"))
# Accuracy    : 0.5373
# Sensitivity : 0.55606
# Specificity : 0.53650
# 
# F           : 0.040
#
# Area under the curve (AUC): 0.546

Model Building (Unbalanced) - Logistic Regression with Demographic & Credit Bureau Data

# Using WoE Variables as derived features in addition original features (imputed for mising / incorrect values)
# As WoE is created based on Target Encoding, in general is not directly correlated with base variable

logistic_model_application_and_creditdata <- glm(formula = Performance ~ Income_imputed + 
                                  No.of.months.in.current.company +
                                  No.of.months.in.current.residence +
                                  Avgas.CC.Utilization.in.last.12.months_WoE +
                                  Avgas.CC.Utilization.in.last.12.months_imputed +
                                  Outstanding.Balance_WoE +
                                  Outstanding.Balance_imputed +
                                  No.of.times.30.DPD.or.worse.in.last.6.months +
                                  No.of.trades.opened.in.last.12.months, 
          family = "binomial", 
          data = train_data [, -1])
summary (logistic_model_application_and_creditdata)
## 
## Call:
## glm(formula = Performance ~ Income_imputed + No.of.months.in.current.company + 
##     No.of.months.in.current.residence + Avgas.CC.Utilization.in.last.12.months_WoE + 
##     Avgas.CC.Utilization.in.last.12.months_imputed + Outstanding.Balance_WoE + 
##     Outstanding.Balance_imputed + No.of.times.30.DPD.or.worse.in.last.6.months + 
##     No.of.trades.opened.in.last.12.months, family = "binomial", 
##     data = train_data[, -1])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.6628  -0.3459  -0.2568  -0.1929   2.9267  
## 
## Coefficients:
##                                                  Estimate Std. Error
## (Intercept)                                    -3.321e+00  9.715e-02
## Income_imputed                                 -2.277e-03  1.563e-03
## No.of.months.in.current.company                -2.086e-03  1.132e-03
## No.of.months.in.current.residence              -1.231e-03  6.731e-04
## Avgas.CC.Utilization.in.last.12.months_WoE     -5.179e-01  7.208e-02
## Avgas.CC.Utilization.in.last.12.months_imputed  2.838e-03  1.013e-03
## Outstanding.Balance_WoE                        -2.478e-01  7.023e-02
## Outstanding.Balance_imputed                    -1.089e-08  1.969e-08
## No.of.times.30.DPD.or.worse.in.last.6.months    1.692e-01  2.045e-02
## No.of.trades.opened.in.last.12.months           2.458e-02  5.929e-03
##                                                z value Pr(>|z|)    
## (Intercept)                                    -34.189  < 2e-16 ***
## Income_imputed                                  -1.457 0.145137    
## No.of.months.in.current.company                 -1.844 0.065240 .  
## No.of.months.in.current.residence               -1.829 0.067357 .  
## Avgas.CC.Utilization.in.last.12.months_WoE      -7.185 6.74e-13 ***
## Avgas.CC.Utilization.in.last.12.months_imputed   2.802 0.005083 ** 
## Outstanding.Balance_WoE                         -3.529 0.000418 ***
## Outstanding.Balance_imputed                     -0.553 0.580165    
## No.of.times.30.DPD.or.worse.in.last.6.months     8.273  < 2e-16 ***
## No.of.trades.opened.in.last.12.months            4.146 3.38e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 17096  on 48862  degrees of freedom
## Residual deviance: 16354  on 48853  degrees of freedom
## AIC: 16374
## 
## Number of Fisher Scoring iterations: 6
# AIC: 16374

logistic_model_application_and_creditdata_2 <- stepAIC(logistic_model_application_and_creditdata, direction="both")
## Start:  AIC=16373.54
## Performance ~ Income_imputed + No.of.months.in.current.company + 
##     No.of.months.in.current.residence + Avgas.CC.Utilization.in.last.12.months_WoE + 
##     Avgas.CC.Utilization.in.last.12.months_imputed + Outstanding.Balance_WoE + 
##     Outstanding.Balance_imputed + No.of.times.30.DPD.or.worse.in.last.6.months + 
##     No.of.trades.opened.in.last.12.months
## 
##                                                  Df Deviance   AIC
## - Outstanding.Balance_imputed                     1    16354 16372
## <none>                                                 16354 16374
## - Income_imputed                                  1    16356 16374
## - No.of.months.in.current.residence               1    16357 16375
## - No.of.months.in.current.company                 1    16357 16375
## - Avgas.CC.Utilization.in.last.12.months_imputed  1    16361 16379
## - Outstanding.Balance_WoE                         1    16366 16384
## - No.of.trades.opened.in.last.12.months           1    16370 16388
## - Avgas.CC.Utilization.in.last.12.months_WoE      1    16406 16424
## - No.of.times.30.DPD.or.worse.in.last.6.months    1    16418 16436
## 
## Step:  AIC=16371.84
## Performance ~ Income_imputed + No.of.months.in.current.company + 
##     No.of.months.in.current.residence + Avgas.CC.Utilization.in.last.12.months_WoE + 
##     Avgas.CC.Utilization.in.last.12.months_imputed + Outstanding.Balance_WoE + 
##     No.of.times.30.DPD.or.worse.in.last.6.months + No.of.trades.opened.in.last.12.months
## 
##                                                  Df Deviance   AIC
## <none>                                                 16354 16372
## - Income_imputed                                  1    16356 16372
## - No.of.months.in.current.company                 1    16357 16373
## - No.of.months.in.current.residence               1    16357 16373
## + Outstanding.Balance_imputed                     1    16354 16374
## - Avgas.CC.Utilization.in.last.12.months_imputed  1    16362 16378
## - Outstanding.Balance_WoE                         1    16367 16383
## - No.of.trades.opened.in.last.12.months           1    16370 16386
## - Avgas.CC.Utilization.in.last.12.months_WoE      1    16406 16422
## - No.of.times.30.DPD.or.worse.in.last.6.months    1    16419 16435
summary(logistic_model_application_and_creditdata_2)
## 
## Call:
## glm(formula = Performance ~ Income_imputed + No.of.months.in.current.company + 
##     No.of.months.in.current.residence + Avgas.CC.Utilization.in.last.12.months_WoE + 
##     Avgas.CC.Utilization.in.last.12.months_imputed + Outstanding.Balance_WoE + 
##     No.of.times.30.DPD.or.worse.in.last.6.months + No.of.trades.opened.in.last.12.months, 
##     family = "binomial", data = train_data[, -1])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.6613  -0.3459  -0.2565  -0.1930   2.9202  
## 
## Coefficients:
##                                                  Estimate Std. Error
## (Intercept)                                    -3.3289262  0.0961958
## Income_imputed                                 -0.0022912  0.0015630
## No.of.months.in.current.company                -0.0020907  0.0011318
## No.of.months.in.current.residence              -0.0012437  0.0006727
## Avgas.CC.Utilization.in.last.12.months_WoE     -0.5177126  0.0720487
## Avgas.CC.Utilization.in.last.12.months_imputed  0.0028376  0.0010127
## Outstanding.Balance_WoE                        -0.2522001  0.0697798
## No.of.times.30.DPD.or.worse.in.last.6.months    0.1694480  0.0204417
## No.of.trades.opened.in.last.12.months           0.0237502  0.0057367
##                                                z value Pr(>|z|)    
## (Intercept)                                    -34.606  < 2e-16 ***
## Income_imputed                                  -1.466 0.142681    
## No.of.months.in.current.company                 -1.847 0.064712 .  
## No.of.months.in.current.residence               -1.849 0.064485 .  
## Avgas.CC.Utilization.in.last.12.months_WoE      -7.186 6.69e-13 ***
## Avgas.CC.Utilization.in.last.12.months_imputed   2.802 0.005078 ** 
## Outstanding.Balance_WoE                         -3.614 0.000301 ***
## No.of.times.30.DPD.or.worse.in.last.6.months     8.289  < 2e-16 ***
## No.of.trades.opened.in.last.12.months            4.140 3.47e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 17096  on 48862  degrees of freedom
## Residual deviance: 16354  on 48854  degrees of freedom
## AIC: 16372
## 
## Number of Fisher Scoring iterations: 6
# AIC: 16372

# Removing Income_imputed as p-value = 0.142681
logistic_model_application_and_creditdata_3 <- glm(formula = Performance ~ 
                                    No.of.months.in.current.company +
                                    No.of.months.in.current.residence +
                                    Avgas.CC.Utilization.in.last.12.months_WoE +
                                    Avgas.CC.Utilization.in.last.12.months_imputed +
                                    Outstanding.Balance_WoE +
                                    Outstanding.Balance_imputed +
                                    No.of.times.30.DPD.or.worse.in.last.6.months +
                                    No.of.trades.opened.in.last.12.months,
                                         family = "binomial", 
                                         data = train_data[, -1])

summary(logistic_model_application_and_creditdata_3)
## 
## Call:
## glm(formula = Performance ~ No.of.months.in.current.company + 
##     No.of.months.in.current.residence + Avgas.CC.Utilization.in.last.12.months_WoE + 
##     Avgas.CC.Utilization.in.last.12.months_imputed + Outstanding.Balance_WoE + 
##     Outstanding.Balance_imputed + No.of.times.30.DPD.or.worse.in.last.6.months + 
##     No.of.trades.opened.in.last.12.months, family = "binomial", 
##     data = train_data[, -1])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.6689  -0.3461  -0.2565  -0.1926   2.9160  
## 
## Coefficients:
##                                                  Estimate Std. Error
## (Intercept)                                    -3.393e+00  8.375e-02
## No.of.months.in.current.company                -1.904e-03  1.124e-03
## No.of.months.in.current.residence              -1.165e-03  6.712e-04
## Avgas.CC.Utilization.in.last.12.months_WoE     -5.253e-01  7.193e-02
## Avgas.CC.Utilization.in.last.12.months_imputed  2.833e-03  1.013e-03
## Outstanding.Balance_WoE                        -2.516e-01  7.021e-02
## Outstanding.Balance_imputed                    -1.135e-08  1.969e-08
## No.of.times.30.DPD.or.worse.in.last.6.months    1.731e-01  2.026e-02
## No.of.trades.opened.in.last.12.months           2.493e-02  5.922e-03
##                                                z value Pr(>|z|)    
## (Intercept)                                    -40.521  < 2e-16 ***
## No.of.months.in.current.company                 -1.694 0.090301 .  
## No.of.months.in.current.residence               -1.735 0.082706 .  
## Avgas.CC.Utilization.in.last.12.months_WoE      -7.303 2.81e-13 ***
## Avgas.CC.Utilization.in.last.12.months_imputed   2.798 0.005143 ** 
## Outstanding.Balance_WoE                         -3.583 0.000339 ***
## Outstanding.Balance_imputed                     -0.576 0.564429    
## No.of.times.30.DPD.or.worse.in.last.6.months     8.544  < 2e-16 ***
## No.of.trades.opened.in.last.12.months            4.210 2.55e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 17096  on 48862  degrees of freedom
## Residual deviance: 16356  on 48854  degrees of freedom
## AIC: 16374
## 
## Number of Fisher Scoring iterations: 6
# AIC: 16374
sort(vif(logistic_model_application_and_creditdata_3),decreasing = TRUE)
##     Avgas.CC.Utilization.in.last.12.months_WoE 
##                                       2.736091 
##                        Outstanding.Balance_WoE 
##                                       1.987564 
## Avgas.CC.Utilization.in.last.12.months_imputed 
##                                       1.923009 
##          No.of.trades.opened.in.last.12.months 
##                                       1.528385 
##   No.of.times.30.DPD.or.worse.in.last.6.months 
##                                       1.329917 
##              No.of.months.in.current.residence 
##                                       1.111627 
##                    Outstanding.Balance_imputed 
##                                       1.076381 
##                No.of.months.in.current.company 
##                                       1.031837
# Removing Outstanding.Balance_imputed due to high p-value = 0.564429
logistic_model_application_and_creditdata_4 <- glm(formula = Performance ~ 
                                    No.of.months.in.current.company +
                                    No.of.months.in.current.residence +
                                    Avgas.CC.Utilization.in.last.12.months_WoE +
                                    Avgas.CC.Utilization.in.last.12.months_imputed +
                                    Outstanding.Balance_WoE +
                                    No.of.times.30.DPD.or.worse.in.last.6.months +
                                    No.of.trades.opened.in.last.12.months,
                                         family = "binomial", 
                                         data = train_data[, -1])

summary(logistic_model_application_and_creditdata_4)
## 
## Call:
## glm(formula = Performance ~ No.of.months.in.current.company + 
##     No.of.months.in.current.residence + Avgas.CC.Utilization.in.last.12.months_WoE + 
##     Avgas.CC.Utilization.in.last.12.months_imputed + Outstanding.Balance_WoE + 
##     No.of.times.30.DPD.or.worse.in.last.6.months + No.of.trades.opened.in.last.12.months, 
##     family = "binomial", data = train_data[, -1])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.6673  -0.3461  -0.2562  -0.1928   2.9210  
## 
## Coefficients:
##                                                  Estimate Std. Error
## (Intercept)                                    -3.4018513  0.0825110
## No.of.months.in.current.company                -0.0019075  0.0011243
## No.of.months.in.current.residence              -0.0011771  0.0006707
## Avgas.CC.Utilization.in.last.12.months_WoE     -0.5252183  0.0718986
## Avgas.CC.Utilization.in.last.12.months_imputed  0.0028331  0.0010125
## Outstanding.Balance_WoE                        -0.2561625  0.0697547
## No.of.times.30.DPD.or.worse.in.last.6.months    0.1734419  0.0202531
## No.of.trades.opened.in.last.12.months           0.0240682  0.0057306
##                                                z value Pr(>|z|)    
## (Intercept)                                    -41.229  < 2e-16 ***
## No.of.months.in.current.company                 -1.697  0.08977 .  
## No.of.months.in.current.residence               -1.755  0.07927 .  
## Avgas.CC.Utilization.in.last.12.months_WoE      -7.305 2.77e-13 ***
## Avgas.CC.Utilization.in.last.12.months_imputed   2.798  0.00514 ** 
## Outstanding.Balance_WoE                         -3.672  0.00024 ***
## No.of.times.30.DPD.or.worse.in.last.6.months     8.564  < 2e-16 ***
## No.of.trades.opened.in.last.12.months            4.200 2.67e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 17096  on 48862  degrees of freedom
## Residual deviance: 16356  on 48855  degrees of freedom
## AIC: 16372
## 
## Number of Fisher Scoring iterations: 6
# AIC: 16372
sort(vif(logistic_model_application_and_creditdata_4),decreasing = TRUE)
##     Avgas.CC.Utilization.in.last.12.months_WoE 
##                                       2.733765 
##                        Outstanding.Balance_WoE 
##                                       1.959824 
## Avgas.CC.Utilization.in.last.12.months_imputed 
##                                       1.922838 
##          No.of.trades.opened.in.last.12.months 
##                                       1.430790 
##   No.of.times.30.DPD.or.worse.in.last.6.months 
##                                       1.328788 
##              No.of.months.in.current.residence 
##                                       1.110156 
##                No.of.months.in.current.company 
##                                       1.031806
# Removing No.of.months.in.current.company as p-value = 0.08977
logistic_model_application_and_creditdata_5 <- glm(formula = Performance ~  
                                    No.of.months.in.current.residence +
                                    Avgas.CC.Utilization.in.last.12.months_WoE +
                                    Avgas.CC.Utilization.in.last.12.months_imputed +
                                    Outstanding.Balance_WoE +
                                    No.of.times.30.DPD.or.worse.in.last.6.months +
                                    No.of.trades.opened.in.last.12.months,
                                         family = "binomial", 
                                         data = train_data[, -1])

summary(logistic_model_application_and_creditdata_5)
## 
## Call:
## glm(formula = Performance ~ No.of.months.in.current.residence + 
##     Avgas.CC.Utilization.in.last.12.months_WoE + Avgas.CC.Utilization.in.last.12.months_imputed + 
##     Outstanding.Balance_WoE + No.of.times.30.DPD.or.worse.in.last.6.months + 
##     No.of.trades.opened.in.last.12.months, family = "binomial", 
##     data = train_data[, -1])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.6663  -0.3463  -0.2562  -0.1927   2.9200  
## 
## Coefficients:
##                                                  Estimate Std. Error
## (Intercept)                                    -3.4690713  0.0725715
## No.of.months.in.current.residence              -0.0011004  0.0006689
## Avgas.CC.Utilization.in.last.12.months_WoE     -0.5280311  0.0718502
## Avgas.CC.Utilization.in.last.12.months_imputed  0.0028043  0.0010118
## Outstanding.Balance_WoE                        -0.2548958  0.0697123
## No.of.times.30.DPD.or.worse.in.last.6.months    0.1778615  0.0200813
## No.of.trades.opened.in.last.12.months           0.0238627  0.0057267
##                                                z value Pr(>|z|)    
## (Intercept)                                    -47.802  < 2e-16 ***
## No.of.months.in.current.residence               -1.645 0.099945 .  
## Avgas.CC.Utilization.in.last.12.months_WoE      -7.349 2.00e-13 ***
## Avgas.CC.Utilization.in.last.12.months_imputed   2.772 0.005579 ** 
## Outstanding.Balance_WoE                         -3.656 0.000256 ***
## No.of.times.30.DPD.or.worse.in.last.6.months     8.857  < 2e-16 ***
## No.of.trades.opened.in.last.12.months            4.167 3.09e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 17096  on 48862  degrees of freedom
## Residual deviance: 16359  on 48856  degrees of freedom
## AIC: 16373
## 
## Number of Fisher Scoring iterations: 6
# AIC: 16368
sort(vif(logistic_model_application_and_creditdata_5),decreasing = TRUE)
##     Avgas.CC.Utilization.in.last.12.months_WoE 
##                                       2.729035 
##                        Outstanding.Balance_WoE 
##                                       1.956879 
## Avgas.CC.Utilization.in.last.12.months_imputed 
##                                       1.921977 
##          No.of.trades.opened.in.last.12.months 
##                                       1.429294 
##   No.of.times.30.DPD.or.worse.in.last.6.months 
##                                       1.307147 
##              No.of.months.in.current.residence 
##                                       1.104710
# Removing No.of.months.in.current.residence due to high p-value = 0.099945
logistic_model_application_and_creditdata_6 <- glm(formula = Performance ~  
                                    Avgas.CC.Utilization.in.last.12.months_WoE +
                                    Avgas.CC.Utilization.in.last.12.months_imputed +
                                    Outstanding.Balance_WoE +
                                    No.of.times.30.DPD.or.worse.in.last.6.months +
                                    No.of.trades.opened.in.last.12.months,
            
                                         family = "binomial", data = train_data[, -1])
summary(logistic_model_application_and_creditdata_6)
## 
## Call:
## glm(formula = Performance ~ Avgas.CC.Utilization.in.last.12.months_WoE + 
##     Avgas.CC.Utilization.in.last.12.months_imputed + Outstanding.Balance_WoE + 
##     No.of.times.30.DPD.or.worse.in.last.6.months + No.of.trades.opened.in.last.12.months, 
##     family = "binomial", data = train_data[, -1])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.6573  -0.3465  -0.2578  -0.1915   2.8857  
## 
## Coefficients:
##                                                 Estimate Std. Error
## (Intercept)                                    -3.511071   0.068084
## Avgas.CC.Utilization.in.last.12.months_WoE     -0.514043   0.071294
## Avgas.CC.Utilization.in.last.12.months_imputed  0.002667   0.001008
## Outstanding.Balance_WoE                        -0.250073   0.069589
## No.of.times.30.DPD.or.worse.in.last.6.months    0.177867   0.020088
## No.of.trades.opened.in.last.12.months           0.024971   0.005687
##                                                z value Pr(>|z|)    
## (Intercept)                                    -51.570  < 2e-16 ***
## Avgas.CC.Utilization.in.last.12.months_WoE      -7.210 5.59e-13 ***
## Avgas.CC.Utilization.in.last.12.months_imputed   2.645 0.008180 ** 
## Outstanding.Balance_WoE                         -3.594 0.000326 ***
## No.of.times.30.DPD.or.worse.in.last.6.months     8.854  < 2e-16 ***
## No.of.trades.opened.in.last.12.months            4.391 1.13e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 17096  on 48862  degrees of freedom
## Residual deviance: 16362  on 48857  degrees of freedom
## AIC: 16374
## 
## Number of Fisher Scoring iterations: 6
# AIC: 16374
sort(vif(logistic_model_application_and_creditdata_6),decreasing = TRUE)
##     Avgas.CC.Utilization.in.last.12.months_WoE 
##                                       2.690655 
##                        Outstanding.Balance_WoE 
##                                       1.954449 
## Avgas.CC.Utilization.in.last.12.months_imputed 
##                                       1.909329 
##          No.of.trades.opened.in.last.12.months 
##                                       1.408925 
##   No.of.times.30.DPD.or.worse.in.last.6.months 
##                                       1.305800
# Removing Avgas.CC.Utilization.in.last.12.months_imputed
# Avgas.CC.Utilization.in.last.12.months_WoE - VIF=2.690655, p-value=5.59e-13
# Avgas.CC.Utilization.in.last.12.months_imputed - VIF=1.909329 ,p-value=0.008180
cor(train_data$Avgas.CC.Utilization.in.last.12.months_imputed, train_data$Avgas.CC.Utilization.in.last.12.months_WoE)
## [1] -0.7412454
# -0.7412454

logistic_model_application_and_creditdata_7 <- glm(formula = Performance ~  
                                    Avgas.CC.Utilization.in.last.12.months_WoE +
                                    Outstanding.Balance_WoE +
                                    No.of.times.30.DPD.or.worse.in.last.6.months +
                                    No.of.trades.opened.in.last.12.months,
            family = "binomial", data = train_data[, -1])
summary(logistic_model_application_and_creditdata_7)
## 
## Call:
## glm(formula = Performance ~ Avgas.CC.Utilization.in.last.12.months_WoE + 
##     Outstanding.Balance_WoE + No.of.times.30.DPD.or.worse.in.last.6.months + 
##     No.of.trades.opened.in.last.12.months, family = "binomial", 
##     data = train_data[, -1])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.6415  -0.3463  -0.2530  -0.1922   2.8780  
## 
## Coefficients:
##                                               Estimate Std. Error z value
## (Intercept)                                  -3.382956   0.046997 -71.982
## Avgas.CC.Utilization.in.last.12.months_WoE   -0.622885   0.057758 -10.784
## Outstanding.Balance_WoE                      -0.264025   0.069537  -3.797
## No.of.times.30.DPD.or.worse.in.last.6.months  0.172489   0.020023   8.615
## No.of.trades.opened.in.last.12.months         0.020781   0.005475   3.796
##                                              Pr(>|z|)    
## (Intercept)                                   < 2e-16 ***
## Avgas.CC.Utilization.in.last.12.months_WoE    < 2e-16 ***
## Outstanding.Balance_WoE                      0.000147 ***
## No.of.times.30.DPD.or.worse.in.last.6.months  < 2e-16 ***
## No.of.trades.opened.in.last.12.months        0.000147 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 17096  on 48862  degrees of freedom
## Residual deviance: 16368  on 48858  degrees of freedom
## AIC: 16378
## 
## Number of Fisher Scoring iterations: 6
# AIC: 16378
sort(vif(logistic_model_application_and_creditdata_7),decreasing = TRUE)
##                      Outstanding.Balance_WoE 
##                                     1.954735 
##   Avgas.CC.Utilization.in.last.12.months_WoE 
##                                     1.776972 
##        No.of.trades.opened.in.last.12.months 
##                                     1.297773 
## No.of.times.30.DPD.or.worse.in.last.6.months 
##                                     1.292529
# Outstanding.Balance_WoE   Avgas.CC.Utilization.in.last.12.months_WoE        No.of.trades.opened.in.last.12.months No.of.times.30.DPD.or.worse.in.last.6.months 
#                1.954735                                     1.776972                                     1.297773                                     1.292529 

logistic_model_application_and_creditdata_unbalanced <- logistic_model_application_and_creditdata_7

test_pred = predict(logistic_model_application_and_creditdata_unbalanced, type = "response", 
                    newdata = test_data)


summary(test_pred)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.01590 0.01915 0.03548 0.04245 0.06008 0.17121
cutOff <- findOptimalCutOff(test_pred, .01,.17)

## Optimal Cutoff =  0.049
# Optimal Cutoff =  0.049

logistic_model_application_and_creditdata_unbalanced_metrics <- evaluateClassificationModel(test_pred,
                            test_actual_default, 
                            cutOff)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    No   Yes
##        No  12717   338
##        Yes  7339   545
##                                           
##                Accuracy : 0.6334          
##                  95% CI : (0.6268, 0.6399)
##     No Information Rate : 0.9578          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0525          
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.61721         
##             Specificity : 0.63407         
##          Pos Pred Value : 0.06913         
##          Neg Pred Value : 0.97411         
##              Prevalence : 0.04217         
##          Detection Rate : 0.02603         
##    Detection Prevalence : 0.37652         
##       Balanced Accuracy : 0.62564         
##                                           
##        'Positive' Class : Yes             
## 

##           Accuracy Sensitivity Specificity    F_score Threshold       AUC
## Accuracy 0.6333636    0.617214   0.6340746 0.04046375     0.049 0.6256443
##          False_positive_Rate True_positive_Rate
## Accuracy           0.3659254           0.617214
rownames(logistic_model_application_and_creditdata_unbalanced_metrics) <- "FullData        - GLM - Unbalanced"
model_Metrics <- rbind(model_Metrics, logistic_model_application_and_creditdata_unbalanced_metrics)
# Optimal Cutoff =  0.049

# Accuracy    : 0.6334
# Sensitivity : 0.61721
# Specificity : 0.63407
#
# F           : 0.04046375
#
# Area under the curve (AUC): 0.6256443

Cross Validation & Sampling with Demographic and Credit Bureau Data

## Logistic Regression - Using Under Sampling
t8 <- Sys.time()
model_glm_fullCustomerData_undersampling <- caret::train(Performance ~  
                                Income_imputed + 
                                No.of.months.in.current.company +
                                No.of.months.in.current.residence +
                                Avgas.CC.Utilization.in.last.12.months_WoE +
                                Avgas.CC.Utilization.in.last.12.months_imputed +
                                Outstanding.Balance_WoE +
                                Outstanding.Balance_imputed +
                                No.of.times.30.DPD.or.worse.in.last.6.months +
                                No.of.trades.opened.in.last.12.months,
                 data = train_data [, -1],
                 method = "glm",
                 family="binomial",
                 preProcess = c("scale", "center"),
                 tuneLength = 5,
                 trControl = trainControl(method = "cv", 
                                          number = 5, 
                                          verboseIter = TRUE,
                                          sampling = "down"))
## + Fold1: parameter=none 
## - Fold1: parameter=none 
## + Fold2: parameter=none 
## - Fold2: parameter=none 
## + Fold3: parameter=none 
## - Fold3: parameter=none 
## + Fold4: parameter=none 
## - Fold4: parameter=none 
## + Fold5: parameter=none 
## - Fold5: parameter=none 
## Aggregating results
## Fitting final model on full training set
test_pred_fullCustomerData_glm_undersampling <- predict(model_glm_fullCustomerData_undersampling, 
                type = "prob",
                newdata = test_data)
t9 <- Sys.time()

t9-t8
## Time difference of 2.388349 secs
# Time difference of 2.016878 secs

summary(test_pred_fullCustomerData_glm_undersampling[,2])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.1994  0.2951  0.4598  0.4584  0.5971  0.8931
# Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
# 0.2164  0.2935  0.4670  0.4590  0.6035  0.8538 
cutOff <- findOptimalCutOff(test_pred_fullCustomerData_glm_undersampling[,2], 0.20, 0.85)

## Optimal Cutoff =  0.541
# Optimal Cutoff =  0.541

model_glm_fullCustomerData_undersampling_metrics <- evaluateClassificationModel(test_pred_fullCustomerData_glm_undersampling[,2],
                            test_actual_default, 
                            cutOff)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    No   Yes
##        No  12735   333
##        Yes  7321   550
##                                          
##                Accuracy : 0.6345         
##                  95% CI : (0.6279, 0.641)
##     No Information Rate : 0.9578         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.0539         
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.62288        
##             Specificity : 0.63497        
##          Pos Pred Value : 0.06988        
##          Neg Pred Value : 0.97452        
##              Prevalence : 0.04217        
##          Detection Rate : 0.02627        
##    Detection Prevalence : 0.37590        
##       Balanced Accuracy : 0.62892        
##                                          
##        'Positive' Class : Yes            
## 

##          Accuracy Sensitivity Specificity    F_score Threshold       AUC
## Accuracy 0.634462   0.6228766   0.6349721 0.04046375     0.541 0.6289243
##          False_positive_Rate True_positive_Rate
## Accuracy           0.3650279          0.6228766
rownames(model_glm_fullCustomerData_undersampling_metrics) <- "FullData        - GLM - Under-Sampling"
model_Metrics <- rbind(model_Metrics, model_glm_fullCustomerData_undersampling_metrics)

# Accuracy    : 0.6345
# Sensitivity : 0.62288
# Specificity : 0.06988
#
# F: 0.04046375
#
# Area under the curve (AUC): 0.6289243

# -------------------------------- Logistic Regression - Using Over Sampling

t8 <- Sys.time()
model_glm_fullCustomerData_oversampling <- caret::train(Performance ~  
                    Income_imputed + 
                    No.of.months.in.current.company +
                    No.of.months.in.current.residence +
                    Avgas.CC.Utilization.in.last.12.months_WoE +
                    Avgas.CC.Utilization.in.last.12.months_imputed +
                    Outstanding.Balance_WoE +
                    Outstanding.Balance_imputed +
                    No.of.times.30.DPD.or.worse.in.last.6.months +
                    No.of.trades.opened.in.last.12.months,
                  data = train_data [, -1],
                  method = "glm",
                  family="binomial",
                  preProcess = c("scale", "center"),
                  tuneLength = 5,
                  trControl = trainControl(method = "cv", 
                                           number = 5, 
                                           verboseIter = TRUE,
                                           sampling = "up"))
## + Fold1: parameter=none 
## - Fold1: parameter=none 
## + Fold2: parameter=none 
## - Fold2: parameter=none 
## + Fold3: parameter=none 
## - Fold3: parameter=none 
## + Fold4: parameter=none 
## - Fold4: parameter=none 
## + Fold5: parameter=none 
## - Fold5: parameter=none 
## Aggregating results
## Fitting final model on full training set
test_pred_fullCustomerData_glm_oversampling <- predict(model_glm_fullCustomerData_oversampling, 
                 type = "prob",
                 newdata = test_data)
t9 <- Sys.time()

t9-t8
## Time difference of 7.357556 secs
# Time difference of 7.105892 secs

summary(test_pred_fullCustomerData_glm_oversampling[,2])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.2206  0.2991  0.4604  0.4581  0.5917  0.8737
# Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
# 0.2224  0.3040  0.4597  0.4587  0.5897  0.8765  
cutOff <- findOptimalCutOff(test_pred_fullCustomerData_glm_oversampling[,2], 0.2, 0.87)

## Optimal Cutoff =  0.532
# Optimal Cutoff =  0.532

model_glm_fullCustomerData_oversampling_metrics <- evaluateClassificationModel(test_pred_fullCustomerData_glm_oversampling[,2],
                            test_actual_default, 
                            cutOff)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    No   Yes
##        No  12468   319
##        Yes  7588   564
##                                          
##                Accuracy : 0.6224         
##                  95% CI : (0.6158, 0.629)
##     No Information Rate : 0.9578         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.0528         
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.63873        
##             Specificity : 0.62166        
##          Pos Pred Value : 0.06919        
##          Neg Pred Value : 0.97505        
##              Prevalence : 0.04217        
##          Detection Rate : 0.02694        
##    Detection Prevalence : 0.38932        
##       Balanced Accuracy : 0.63020        
##                                          
##        'Positive' Class : Yes            
## 

##           Accuracy Sensitivity Specificity    F_score Threshold       AUC
## Accuracy 0.6223793   0.6387316   0.6216594 0.04046375     0.532 0.6301955
##          False_positive_Rate True_positive_Rate
## Accuracy           0.3783406          0.6387316
rownames(model_glm_fullCustomerData_oversampling_metrics) <- "FullData        - GLM - Over-Sampling"
model_Metrics <- rbind(model_Metrics, model_glm_fullCustomerData_oversampling_metrics)

# Accuracy    : 0.6264865
# Sensitivity : 0.6375991
# Specificity : 0.6259972
#
# F: 0.04046375
#
# Area under the curve (AUC): 0.6317982
#
# -------------------------------- Logistic Regression - Using SMOTE
t8 <- Sys.time()
model_glm_fullCustomerData_smote <- caret::train(Performance ~  
                   Income_imputed + 
                   No.of.months.in.current.company +
                   No.of.months.in.current.residence +
                   Avgas.CC.Utilization.in.last.12.months_WoE +
                   Avgas.CC.Utilization.in.last.12.months_imputed +
                   Outstanding.Balance_WoE +
                   Outstanding.Balance_imputed +
                   No.of.times.30.DPD.or.worse.in.last.6.months +
                   No.of.trades.opened.in.last.12.months,
                 data = train_data [, -1],
                 method = "glm",
                 family="binomial",
                 preProcess = c("scale", "center"),
                 tuneLength = 5,
                 trControl = trainControl(method = "cv", 
                                          number = 5, 
                                          verboseIter = TRUE,
                                          sampling = "smote"))
## + Fold1: parameter=none 
## - Fold1: parameter=none 
## + Fold2: parameter=none 
## - Fold2: parameter=none 
## + Fold3: parameter=none 
## - Fold3: parameter=none 
## + Fold4: parameter=none 
## - Fold4: parameter=none 
## + Fold5: parameter=none 
## - Fold5: parameter=none 
## Aggregating results
## Fitting final model on full training set
test_pred_fullCustomerData_glm_smote <- predict(model_glm_fullCustomerData_smote, 
                type = "prob",
                newdata = test_data)
t9 <- Sys.time()

t9-t8
## Time difference of 9.730844 secs
# Time difference of 10.60025 secs

summary(test_pred_fullCustomerData_glm_smote[,2])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.1595  0.2487  0.3860  0.3943  0.5191  0.8426
# Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
# 0.1617  0.2449  0.3912  0.3948  0.5260  0.8312  
cutOff <- findOptimalCutOff(test_pred_fullCustomerData_glm_smote[,2], 0.16, 0.83)

## Optimal Cutoff =  0.458
# Optimal Cutoff =  0.458

model_glm_fullCustomerData_smotesampling_metrics <- evaluateClassificationModel(test_pred_fullCustomerData_glm_smote[,2],
                            test_actual_default, 
                            cutOff)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    No   Yes
##        No  12568   319
##        Yes  7488   564
##                                           
##                Accuracy : 0.6272          
##                  95% CI : (0.6206, 0.6337)
##     No Information Rate : 0.9578          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0544          
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.63873         
##             Specificity : 0.62665         
##          Pos Pred Value : 0.07004         
##          Neg Pred Value : 0.97525         
##              Prevalence : 0.04217         
##          Detection Rate : 0.02694         
##    Detection Prevalence : 0.38455         
##       Balanced Accuracy : 0.63269         
##                                           
##        'Positive' Class : Yes             
## 

##           Accuracy Sensitivity Specificity    F_score Threshold       AUC
## Accuracy 0.6271551   0.6387316   0.6266454 0.04046375     0.458 0.6326885
##          False_positive_Rate True_positive_Rate
## Accuracy           0.3733546          0.6387316
rownames(model_glm_fullCustomerData_smotesampling_metrics) <- "FullData        - GLM - SMOTE-Sampling"
model_Metrics <- rbind(model_Metrics, model_glm_fullCustomerData_smotesampling_metrics)

# Accuracy    : 0.6270118
# Sensitivity : 0.6398641
# Specificity : 0.626446
#
# F: 0.04046375
#
# Area under the curve (AUC): 0.6326885

# -------------------------------- Decision Tree - Using Under-Sampling
#           Accuracy Sensitivity Specificity    F_score Threshold       AUC False_positive_Rate True_positive_Rate
# Accuracy 0.05372749   0.9852775   0.0127144 0.04046375      0.05 0.5010041           0.9852775          0.9872856
# Discarding this Under-sampling options for Decision Tree

# -------------------------------- Decision Tree - Using Over-Sampling
#           Accuracy Sensitivity Specificity    F_score Threshold       AUC False_positive_Rate True_positive_Rate
# Accuracy 0.8581594   0.1313703   0.8901576 0.04046375      0.05 0.5107639           0.1098424          0.1313703
# Discarding this over-sampling options for Decision Tree
# -------------------------------- Decision Tree - Using SMOTE

t8 <- Sys.time()
model_rpart_fullCustomerData_smote <- caret::train(Performance ~  
                          Income_imputed + 
                          No.of.months.in.current.company +
                          No.of.months.in.current.residence +
                          #Avgas.CC.Utilization.in.last.12.months_WoE +
                          Avgas.CC.Utilization.in.last.12.months_imputed +
                          #Outstanding.Balance_WoE +
                          Outstanding.Balance_imputed +
                          No.of.times.30.DPD.or.worse.in.last.6.months +
                          No.of.trades.opened.in.last.12.months,
                   data = train_data [, -1],
                   method = "rpart",
                   #preProcess = c("scale", "center"),
                   #minsplit=30, minbucket = 15, cp=0.0001,
                   tuneLength = 5,
                   trControl = trainControl(method = "cv", 
                                            number = 5, 
                                            verboseIter = TRUE,
                                            sampling = "smote"))
## + Fold1: cp=9.695e-05 
## - Fold1: cp=9.695e-05 
## + Fold2: cp=9.695e-05 
## - Fold2: cp=9.695e-05 
## + Fold3: cp=9.695e-05 
## - Fold3: cp=9.695e-05 
## + Fold4: cp=9.695e-05 
## - Fold4: cp=9.695e-05 
## + Fold5: cp=9.695e-05 
## - Fold5: cp=9.695e-05 
## Aggregating results
## Selecting tuning parameters
## Fitting cp = 0.000388 on full training set
# Fitting cp = 0.000388 on full training set
t9 <- Sys.time()
t9-t8
## Time difference of 10.41869 secs
# Time difference of 16.57429 secs

# # plot(model_rpart_fullCustomerData_smote)
# # prp(model_rpart_fullCustomerData_smote, box.palette = "Reds", tweak = 1.2)
# library(rpart.plot)
# rpart.plot(model_rpart_fullCustomerData_smote$finalModel)

test_pred_fullCustomerData_rpart_smote <- predict(model_rpart_fullCustomerData_smote, 
                  type = "prob",
                  newdata = test_data)

summary(test_pred_fullCustomerData_rpart_smote[,2])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.1289  0.1782  0.2690  0.2766  1.0000
# Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
# 0.0000  0.1083  0.1765  0.2533  0.2500  1.0000   
cutOff <- findOptimalCutOff(test_pred_fullCustomerData_rpart_smote[,2], 0.1, 0.25)

## Optimal Cutoff =  0.191
# Optimal Cutoff =  0.206

model_rpart_fullCustomerData_smotesampling_metrics <- evaluateClassificationModel(test_pred_fullCustomerData_rpart_smote[,2],
                            test_actual_default, 
                            cutOff)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    No   Yes
##        No  11706   362
##        Yes  8350   521
##                                           
##                Accuracy : 0.5839          
##                  95% CI : (0.5772, 0.5906)
##     No Information Rate : 0.9578          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0326          
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.59003         
##             Specificity : 0.58367         
##          Pos Pred Value : 0.05873         
##          Neg Pred Value : 0.97000         
##              Prevalence : 0.04217         
##          Detection Rate : 0.02488         
##    Detection Prevalence : 0.42366         
##       Balanced Accuracy : 0.58685         
##                                           
##        'Positive' Class : Yes             
## 

##           Accuracy Sensitivity Specificity    F_score Threshold       AUC
## Accuracy 0.5839343    0.590034   0.5836657 0.04046375     0.191 0.5868499
##          False_positive_Rate True_positive_Rate
## Accuracy           0.4163343           0.590034
rownames(model_rpart_fullCustomerData_smotesampling_metrics) <- "FullData        - RPART - SMOTE-Sampling"
model_Metrics <- rbind(model_Metrics, model_rpart_fullCustomerData_smotesampling_metrics)

# Accuracy    : 0.6015569
# Sensitivity : 0.599094
# Specificity : 0.6016653
#
# F: 0.04046375
#
# Area under the curve (AUC): 0.6003797

# -------------------------------- Random Forest - Using Under-Sampling
# Remove code snippet related to this model
#           Accuracy Sensitivity Specificity    F_score Threshold       AUC False_positive_Rate True_positive_Rate
# Accuracy 0.6067147   0.6070215   0.6067012 0.04046375     0.522 0.6068614           0.3932988          0.6070215
# Discarding this Under-sampling options for RF

# -------------------------------- Random Forest - Using Over-Sampling
# Remove code snippet related to this model
#           Accuracy Sensitivity Specificity    F_score Threshold       AUC False_positive_Rate True_positive_Rate
# Accuracy 0.5677444   0.5934315   0.5666135 0.04046375      0.05 0.5800225           0.4333865          0.5934315
# Also takes 50+ mins to build model. So, discarding RF Over-Sampling option
# Discarding this Over-sampling options for RF

# -------------------------------- Random Forest - Using SMOTE
t12 <- Sys.time()
model_rf_fullCustomerData_DemographicData_smote <- caret::train(Performance ~ 
                           Income_imputed + 
                           No.of.months.in.current.company +
                           No.of.months.in.current.residence +
                           Avgas.CC.Utilization.in.last.12.months_WoE +
                           Avgas.CC.Utilization.in.last.12.months_imputed +
                           Outstanding.Balance_WoE +
                           Outstanding.Balance_imputed +
                           No.of.times.30.DPD.or.worse.in.last.6.months +
                           No.of.trades.opened.in.last.12.months,
                        data = train_data [, -1],
                        method = "rf",
                        ntree = 1000,
                        preProcess = c("scale", "center"),
                        trControl = trainControl(method = "cv", 
          number = 5, 
          verboseIter = TRUE,
          sampling = "smote"))
## + Fold1: mtry=2 
## - Fold1: mtry=2 
## + Fold1: mtry=5 
## - Fold1: mtry=5 
## + Fold1: mtry=9 
## - Fold1: mtry=9 
## + Fold2: mtry=2 
## - Fold2: mtry=2 
## + Fold2: mtry=5 
## - Fold2: mtry=5 
## + Fold2: mtry=9 
## - Fold2: mtry=9 
## + Fold3: mtry=2 
## - Fold3: mtry=2 
## + Fold3: mtry=5 
## - Fold3: mtry=5 
## + Fold3: mtry=9 
## - Fold3: mtry=9 
## + Fold4: mtry=2 
## - Fold4: mtry=2 
## + Fold4: mtry=5 
## - Fold4: mtry=5 
## + Fold4: mtry=9 
## - Fold4: mtry=9 
## + Fold5: mtry=2 
## - Fold5: mtry=2 
## + Fold5: mtry=5 
## - Fold5: mtry=5 
## + Fold5: mtry=9 
## - Fold5: mtry=9 
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 2 on full training set
t13 <- Sys.time()
t13-t12
## Time difference of 5.685651 mins
# Time difference of 4.712344 mins
test_pred_fullCustomerData_rf_smote <- predict(model_rf_fullCustomerData_DemographicData_smote, 
       type = "prob",
       newdata = test_data)

summary(test_pred_fullCustomerData_rf_smote[,2])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0990  0.2230  0.2459  0.3670  0.8390
# Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
# 0.0010  0.0980  0.2240  0.2435  0.3650  0.8620 
cutOff <- findOptimalCutOff(test_pred_fullCustomerData_rf_smote[,2], .001,.86)

## Optimal Cutoff =  0.279
# Optimal Cutoff =  0.279

model_rf_fullCustomerData_smotesampling_metrics <- evaluateClassificationModel(test_pred_fullCustomerData_rf_smote[,2],
                                         test_actual_default, 
                                         cutOff)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    No   Yes
##        No  12180   352
##        Yes  7876   531
##                                           
##                Accuracy : 0.607           
##                  95% CI : (0.6004, 0.6137)
##     No Information Rate : 0.9578          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0411          
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.60136         
##             Specificity : 0.60730         
##          Pos Pred Value : 0.06316         
##          Neg Pred Value : 0.97191         
##              Prevalence : 0.04217         
##          Detection Rate : 0.02536         
##    Detection Prevalence : 0.40150         
##       Balanced Accuracy : 0.60433         
##                                           
##        'Positive' Class : Yes             
## 

##          Accuracy Sensitivity Specificity    F_score Threshold       AUC
## Accuracy 0.607049    0.601359   0.6072996 0.04046375     0.279 0.6043293
##          False_positive_Rate True_positive_Rate
## Accuracy           0.3927004           0.601359
rownames(model_rf_fullCustomerData_smotesampling_metrics) <- "FullData        - RF - SMOTE-Sampling"
model_Metrics <- rbind(model_Metrics, model_rf_fullCustomerData_smotesampling_metrics)

plot(varImp(object=model_rf_fullCustomerData_DemographicData_smote),main="Random Forest (SMOTE) - Variable Importance")

# Accuracy    : 0.6212
# Sensitivity : 0.62288
# Specificity : 0.62111
#
# F: 0.040
#
# Area under the curve (AUC): 0.622

Evaluation of Models for Final Model Selection

# Evaluate various metrics across vall models built
# Evaluating based on AUC, F-Score, Sensitivity, Specificity and Accuracy
View(model_Metrics)

# Analyse Lift, Gain and KS-Statistic metrics
model_Metrics$KSStatistic [1] <- GainLiftChart_KSStatistic(logistic_model_demographic_data_unbalanced, test_data,  "response")

## # A tibble: 10 x 6
##    bucket total totalresp Cumresp      Gain  Cumlift
##     <int> <int>     <dbl>   <dbl>     <dbl>    <dbl>
##  1      1  2094       131     131  14.83579 1.483579
##  2      2  2094       103     234  26.50057 1.325028
##  3      3  2094       105     339  38.39185 1.279728
##  4      4  2094        88     427  48.35787 1.208947
##  5      5  2094       102     529  59.90940 1.198188
##  6      6  2094        70     599  67.83692 1.130615
##  7      7  2094        88     687  77.80294 1.111471
##  8      8  2094        69     756  85.61721 1.070215
##  9      9  2094        59     815  92.29898 1.025544
## 10     10  2093        68     883 100.00000 1.000000
model_Metrics$Lift [1] <- 1.1
model_Metrics$Gain [1] <- 59.91

model_Metrics$KSStatistic [2] <- GainLiftChart_KSStatistic(logistic_model_application_and_creditdata_unbalanced, test_data,  "response")

## # A tibble: 10 x 6
##    bucket total totalresp Cumresp      Gain  Cumlift
##     <int> <int>     <dbl>   <dbl>     <dbl>    <dbl>
##  1      1  2094       181     181  20.49830 2.049830
##  2      2  2094       149     330  37.37259 1.868630
##  3      3  2094       135     465  52.66138 1.755379
##  4      4  2094       109     574  65.00566 1.625142
##  5      5  2094        88     662  74.97169 1.499434
##  6      6  2094        73     735  83.23896 1.387316
##  7      7  2094        48     783  88.67497 1.266785
##  8      8  2094        50     833  94.33749 1.179219
##  9      9  2094        32     865  97.96149 1.088461
## 10     10  2093        18     883 100.00000 1.000000
model_Metrics$Lift [2] <- 1.49
model_Metrics$Gain [2] <- 74.97

model_Metrics$KSStatistic [3] <- GainLiftChart_KSStatistic(model_glm_fullCustomerData_undersampling, test_data,  "raw")

## # A tibble: 10 x 6
##    bucket total totalresp Cumresp      Gain  Cumlift
##     <int> <int>     <int>   <int>     <dbl>    <dbl>
##  1      1  2094       124     124  14.04304 1.404304
##  2      2  2094       129     253  28.65232 1.432616
##  3      3  2094       148     401  45.41336 1.513779
##  4      4  2094       138     539  61.04190 1.526048
##  5      5  2094        95     634  71.80068 1.436014
##  6      6  2094        47     681  77.12344 1.285391
##  7      7  2094        50     731  82.78596 1.182657
##  8      8  2094        48     779  88.22197 1.102775
##  9      9  2094        50     829  93.88448 1.043161
## 10     10  2093        54     883 100.00000 1.000000
model_Metrics$Lift [3] <- 1.43
model_Metrics$Gain [3] <- 71.80

model_Metrics$KSStatistic [4] <- GainLiftChart_KSStatistic(model_glm_fullCustomerData_oversampling, test_data,  "raw")

## # A tibble: 10 x 6
##    bucket total totalresp Cumresp      Gain  Cumlift
##     <int> <int>     <int>   <int>     <dbl>    <dbl>
##  1      1  2094       125     125  14.15629 1.415629
##  2      2  2094       133     258  29.21857 1.460929
##  3      3  2094       148     406  45.97961 1.532654
##  4      4  2094       140     546  61.83465 1.545866
##  5      5  2094        90     636  72.02718 1.440544
##  6      6  2094        46     682  77.23669 1.287278
##  7      7  2094        48     730  82.67271 1.181039
##  8      8  2094        48     778  88.10872 1.101359
##  9      9  2094        49     827  93.65798 1.040644
## 10     10  2093        56     883 100.00000 1.000000
model_Metrics$Lift [4] <- 1.44
model_Metrics$Gain [4] <- 72.23

model_Metrics$KSStatistic [5] <-GainLiftChart_KSStatistic(model_glm_fullCustomerData_smote, test_data, "raw")

## # A tibble: 10 x 6
##    bucket total totalresp Cumresp      Gain  Cumlift
##     <int> <int>     <int>   <int>     <dbl>    <dbl>
##  1      1  2094       132     132  14.94904 1.494904
##  2      2  2094       163     295  33.40883 1.670442
##  3      3  2094       153     448  50.73613 1.691204
##  4      4  2094        56     504  57.07814 1.426954
##  5      5  2094        59     563  63.75991 1.275198
##  6      6  2094        54     617  69.87542 1.164590
##  7      7  2094        69     686  77.68969 1.109853
##  8      8  2094        63     749  84.82446 1.060306
##  9      9  2094        69     818  92.63873 1.029319
## 10     10  2093        65     883 100.00000 1.000000
model_Metrics$Lift [5] <- 1.27
model_Metrics$Gain [5] <- 63.75

model_Metrics$KSStatistic [6] <-GainLiftChart_KSStatistic(model_rpart_fullCustomerData_smote, test_data, "raw")

## # A tibble: 10 x 6
##    bucket total totalresp Cumresp      Gain  Cumlift
##     <int> <int>     <int>   <int>     <dbl>    <dbl>
##  1      1  2094       149     149  16.87429 1.687429
##  2      2  2094       108     257  29.10532 1.455266
##  3      3  2094        72     329  37.25934 1.241978
##  4      4  2094        78     407  46.09287 1.152322
##  5      5  2094        73     480  54.36014 1.087203
##  6      6  2094        88     568  64.32616 1.072103
##  7      7  2094        73     641  72.59343 1.037049
##  8      8  2094        78     719  81.42695 1.017837
##  9      9  2094        77     796  90.14723 1.001636
## 10     10  2093        87     883 100.00000 1.000000
model_Metrics$Lift [6] <- 1.08
model_Metrics$Gain [6] <- 54.36

model_Metrics$KSStatistic [7] <- GainLiftChart_KSStatistic(model_rf_fullCustomerData_DemographicData_smote, test_data, "raw")

## # A tibble: 10 x 6
##    bucket total totalresp Cumresp      Gain   Cumlift
##     <int> <int>     <int>   <int>     <dbl>     <dbl>
##  1      1  2094       140     140  15.85504 1.5855040
##  2      2  2094        66     206  23.32956 1.1664779
##  3      3  2094        86     292  33.06908 1.1023028
##  4      4  2094        71     363  41.10985 1.0277463
##  5      5  2094        93     456  51.64213 1.0328426
##  6      6  2094        83     539  61.04190 1.0173650
##  7      7  2094        87     626  70.89468 1.0127811
##  8      8  2094        83     709  80.29445 1.0036806
##  9      9  2094        83     792  89.69422 0.9966025
## 10     10  2093        91     883 100.00000 1.0000000
model_Metrics$Lift [7] <- 1.03
model_Metrics$Gain [7] <- 51.64
# KS-Statistic = 0.0643504
0.0643504
## [1] 0.0643504

Final Model Selection

View(model_Metrics)
print(model_Metrics)
##                                           Accuracy Sensitivity Specificity
## DemographicData - GLM - Unbalanced       0.5397583   0.5537939   0.5391404
## FullData        - GLM - Unbalanced       0.6333636   0.6172140   0.6340746
## FullData        - GLM - Under-Sampling   0.6344620   0.6228766   0.6349721
## FullData        - GLM - Over-Sampling    0.6223793   0.6387316   0.6216594
## FullData        - GLM - SMOTE-Sampling   0.6271551   0.6387316   0.6266454
## FullData        - RPART - SMOTE-Sampling 0.5839343   0.5900340   0.5836657
## FullData        - RF - SMOTE-Sampling    0.6070490   0.6013590   0.6072996
##                                             F_score Threshold       AUC
## DemographicData - GLM - Unbalanced       0.04046375     0.042 0.5464671
## FullData        - GLM - Unbalanced       0.04046375     0.049 0.6256443
## FullData        - GLM - Under-Sampling   0.04046375     0.541 0.6289243
## FullData        - GLM - Over-Sampling    0.04046375     0.532 0.6301955
## FullData        - GLM - SMOTE-Sampling   0.04046375     0.458 0.6326885
## FullData        - RPART - SMOTE-Sampling 0.04046375     0.191 0.5868499
## FullData        - RF - SMOTE-Sampling    0.04046375     0.279 0.6043293
##                                          False_positive_Rate
## DemographicData - GLM - Unbalanced                 0.4608596
## FullData        - GLM - Unbalanced                 0.3659254
## FullData        - GLM - Under-Sampling             0.3650279
## FullData        - GLM - Over-Sampling              0.3783406
## FullData        - GLM - SMOTE-Sampling             0.3733546
## FullData        - RPART - SMOTE-Sampling           0.4163343
## FullData        - RF - SMOTE-Sampling              0.3927004
##                                          True_positive_Rate KSStatistic
## DemographicData - GLM - Unbalanced                0.5537939   0.1073425
## FullData        - GLM - Unbalanced                0.6172140   0.2666422
## FullData        - GLM - Under-Sampling            0.6228766   0.2532234
## FullData        - GLM - Over-Sampling             0.6387316   0.2568917
## FullData        - GLM - SMOTE-Sampling            0.6387316   0.2153007
## FullData        - RPART - SMOTE-Sampling          0.5900340   0.1127819
## FullData        - RF - SMOTE-Sampling             0.6013590   0.0643504
##                                          Lift  Gain
## DemographicData - GLM - Unbalanced       1.10 59.91
## FullData        - GLM - Unbalanced       1.49 74.97
## FullData        - GLM - Under-Sampling   1.43 71.80
## FullData        - GLM - Over-Sampling    1.44 72.23
## FullData        - GLM - SMOTE-Sampling   1.27 63.75
## FullData        - RPART - SMOTE-Sampling 1.08 54.36
## FullData        - RF - SMOTE-Sampling    1.03 51.64
# Top 2 Models Selected
# Note : 
# -----
#        1) Discarding Random Forest as it involves high computational resources,
#           and also not providing any better formance
#        2) Discarding GLM/Unbalanced with Full Data, as well because it is trained with unbalanced data

# FullData        - GLM   - SMOTE-Sampling
# FullData        - GLM   - Over-Sampling
#
# Discarding GLM/Unbalanced model though it has highest KS-Statistic value = 0.2666422 as it is based on Unabalanced data
# GLM/Over-Sampling model has better KS-Statistic value = 0.2568917 than GLM/SMOTE model KS-Statistic value = 0.2153007

#final_Model_For_Scorecard <- model_glm_fullCustomerData_oversampling
#final_Model_For_Scorecard$finalModel

final_Model_For_Scorecard <- logistic_model_application_and_creditdata_unbalanced
# Generalized Linear Model 
# 
# 48863 samples
# 9 predictor
# 2 classes: '0', '1' 
# 
# Pre-processing: scaled (9), centered (9) 
# Resampling: Cross-Validated (5 fold) 
# Summary of sample sizes: 39090, 39091, 39091, 39090, 39090 
# Addtional sampling using up-sampling prior to pre-processing
# 
# Resampling results:
#   
#   Accuracy   Kappa     
# 0.5782698  0.04827151

# Coefficients:
#   (Intercept)                                  Income_imputed  
# -0.00420                                        -0.01460  
# No.of.months.in.current.company               No.of.months.in.current.residence  
# -0.03083                                        -0.06005  
# Avgas.CC.Utilization.in.last.12.months_WoE  Avgas.CC.Utilization.in.last.12.months_imputed  
# -0.27858                                         0.11162  
# Outstanding.Balance_WoE                     Outstanding.Balance_imputed  
# -0.12984                                        -0.02812  
# No.of.times.30.DPD.or.worse.in.last.6.months           No.of.trades.opened.in.last.12.months  
# 0.22748                                         0.14213  
# 
# Degrees of Freedom: 93599 Total (i.e. Null);  93590 Residual
# Null Deviance:        129800 
# Residual Deviance: 120700     AIC: 120700

Application Scorecard Building and Financial Benefit Analysis

App_Scorecard <- function(model,testdataset){
  
  m <- model
  score_data <- testdataset
  score_data$bad <- predict(m,type="response",newdata = score_data[,-12])
  score_data$good <- (1- score_data$bad)
  score_data$odds <- score_data$good/score_data$bad
  score_data$logodds <- log(score_data$odds)

  points0 = 400
  odds0 = 10
  pdo = 20
  factor = pdo / log(2)
  offset = points0 - factor * log( odds0 )
  
  score_data$Score <- offset + factor * score_data$logodds
  
  return(score_data)
}

testdata_scorecard <- App_Scorecard(final_Model_For_Scorecard,test_data)
rejecteddata_scorecard <- App_Scorecard(final_Model_For_Scorecard,rejected_records)

#Optimal Cutoff =  0.049  - for the unbalanced model
points0 = 400
odds0 = 10
pdo = 20
factor = pdo / log(2)
offset = points0 - factor * log( odds0 )

cutoff_prob_from_model = .049
cutoff_logodd <- log((1-cutoff_prob_from_model)/cutoff_prob_from_model)
cutoff_score <- offset + factor * cutoff_logodd
cutoff_score
## [1] 419.1333
#419.33

## rejected data analysis
nrow(rejecteddata_scorecard[(rejecteddata_scorecard$Score >= cutoff_score),])
## [1] 55
#55
boxplot(rejecteddata_scorecard$Score)

##  With this build we would have got 55 good customers who had been rejected.

## Full data analysis
fulldata <- customer_master_data
fulldata_scorecard <- App_Scorecard(final_Model_For_Scorecard,fulldata)
ggplot(fulldata_scorecard,aes(fulldata_scorecard$Score,fill=fulldata_scorecard$Performance))+ geom_histogram(binwidth = 10,colour="black")

fulldata_scorecard$predict_performance <- ifelse(fulldata_scorecard$bad>=0.049,1,0)
fulldata_scorecard$iswrong <- ifelse(fulldata_scorecard$predict_performance != fulldata_scorecard$Performance,1,0)

percent_of_wrongprediction <- (sum(fulldata_scorecard$iswrong)/nrow(fulldata_scorecard)) * 100
percent_of_wrongprediction
## [1] 36.40154
##-------------------  expected credit loss-------------------------S

#Expected loss(c1) = PD * EAD * LGD

#PD = Probability of default of each customer

#EAD = Exposure at default or oustanding

#LGD = Loss given default.

#Lets assume if recovery likelihood is 30% then LDG = 1  - 0.30 = 0.7

#Total loss expected if all customers are bad
fulldata_scorecard$expected_loss = fulldata_scorecard$bad * fulldata_scorecard$Outstanding.Balance_imputed * 0.7
# Calculated on Full data
# Total prospect loss =  2634047450
# (Prob of bad * Exposure at default * Loss given default)
# Expected loss by default customer from model 147718048
# 
# The loss amount of 147718048 can be straight away avoided by not giving loan to default customer prospects 
# However, by looking into the application score card, some customers of default category can be consider at medium risk because they fall in the boundary range. 
# This potential credit loss can be minimized by target those customer, which Credit Score falls within Good and Intermediate. 
# The verification / acquisition cost of Bad Customer can be minimized by this Model 

#Creating a dataframe for loss calculation
potential_credit_loss <- fulldata_scorecard[, c("Performance", "bad","Outstanding.Balance_imputed")]

#Subsetting for the defaulted customers
loss_default_customer <- potential_credit_loss[(potential_credit_loss$Performance == 1),]                                             

#Loss if the model is being used on the defaulted customer
loss_default_customer$loss_model <- as.integer(loss_default_customer$bad * loss_default_customer$Outstanding.Balance_imputed * 0.7)

#Calculating the total expected loss and the loss with the model.
total_expected_loss = sum(fulldata_scorecard$expected_loss)
total_extected_loss_default_cust <- sum(loss_default_customer$loss_model)

print(total_expected_loss)
## [1] 2634047450
print(total_extected_loss_default_cust)
## [1] 147718048
#auto rejection rate

auto_rejection_rate <- sum(fulldata_scorecard$predict_performance)/nrow(fulldata_scorecard)
auto_approval_rate = 1 - auto_rejection_rate
auto_approval_rate
## [1] 0.6264434
# Auto approval rate is 62.64%

# Rejected data analysis
#Number of good customers that is being rejected
nrow(rejecteddata_scorecard[(rejecteddata_scorecard$Score >= cutoff_score),])
## [1] 55
#55

boxplot(rejecteddata_scorecard$Score)

# The histograms plots indicates that the number of defaulters decreases after Cut-off Score of 419
# Even though 419 is boundary value with Good and Bad Customers, we can suggest that the boundary range of customers fall between Good and Bad. 

rejecteddata_scorecard$expected_loss = rejecteddata_scorecard$bad * rejecteddata_scorecard$Outstanding.Balance_imputed * 0.7
rejecteddata_loss <- rejecteddata_scorecard[,c("Score","bad","Outstanding.Balance_imputed")]
rejecteddata_cutoff_score <- rejecteddata_loss[(rejecteddata_loss$Score >= 419),]

loss_by_rejected_good_customer <- sum(rejecteddata_cutoff_score$Outstanding.Balance_imputed * 0.7)

# Total prospect loss =  96026810
# (Loss because of the full rejected data)
# 
# Loss due of Rejection of Good customers is 43876837
# 
# The amount of 43876837 would have been gained on using the model because it was the loss by rejection the good customers

rejecteddata_expected_loss <- sum(rejecteddata_scorecard$expected_loss)
print(rejecteddata_expected_loss)
## [1] 96026810
# 96026810
print(loss_by_rejected_good_customer)
## [1] 43876837
# 43876837